diff --git a/README.md b/README.md
index b6300d9b6ee..7f570e52f78 100644
--- a/README.md
+++ b/README.md
@@ -56,11 +56,10 @@ releases of the toolkit.
* Java 17 is needed to run or build GATK.
We recommend one of the following:
* Download the Eclipse Foundation's distribution of OpenJDK 17 from [adoptium.net](https://adoptium.net/). Navigate to the [release archive](https://adoptium.net/temurin/archive/?version=17) to find downloads for Java 17.
- * On Mac OS, you can install the [Homebrew package manager](https://brew.sh/) and run `brew tap homebrew/cask-versions` followed by `brew install --cask temurin17` to install the Eclipse Foundation's OpenJDK 17.
- * Python 2.6 or greater (required to run the `gatk` frontend script)
- * Python 3.6.2, along with a set of additional Python packages, is required to run some tools and workflows.
+ * On Mac OS, you can install the [Homebrew package manager](https://brew.sh/) and run `brew install temurin@17` to install the Eclipse Foundation's OpenJDK 17.
+ * Python 3.10.13, along with a set of additional Python packages, is required to run some tools and workflows (also required to run the `gatk` frontend script).
See [Python Dependencies](#python) for more information.
- * R 3.2.5 (needed for producing plots in certain tools)
+ * R 4.3.1 (needed for producing plots in certain tools)
* To build GATK:
* A Java 17 JDK
* Git 2.5 or greater
@@ -73,7 +72,7 @@ releases of the toolkit.
the size of the download.
* Gradle 5.6. We recommend using the `./gradlew` script which will
download and use an appropriate gradle version automatically (see examples below).
- * R 3.2.5 (needed for running the test suite)
+ * R 4.3.1 (needed for running the test suite)
* Pre-packaged Docker images with all needed dependencies installed can be found on
[our dockerhub repository](https://hub.docker.com/r/broadinstitute/gatk/). This requires a recent version of the
docker client, which can be found on the [docker website](https://www.docker.com/get-docker).
@@ -141,10 +140,10 @@ Our docker image contains the following bioinformatics tools, which can be run b
* bcftools (1.13)
* tabix (1.13+ds)
-We also include an installation of Python3 (3.6.10) with the following popular packages included:
+We also include an installation of Python3 (3.10.13) with the following popular packages included:
* numpy
* scipy
-* tensorflow
+* pytorch
* pymc3
* keras
* scikit-learn
@@ -154,7 +153,7 @@ We also include an installation of Python3 (3.6.10) with the following popular p
* pyvcf
* pysam
-We also include an installation of R (3.6.2) with the following popular packages included:
+We also include an installation of R (4.3.1) with the following popular packages included:
* data.table
* dplyr
* ggplot2
@@ -203,7 +202,7 @@ For more details on system packages, see the GATK [Base Dockerfile](scripts/dock
## Running GATK4
* The standard way to run GATK4 tools is via the **`gatk`** wrapper script located in the root directory of a clone of this repository.
- * Requires Python 2.6 or greater (this includes Python 3.x)
+ * Requires Python 3.9 or greater
* You need to have built the GATK as described in the [Building GATK4](#building) section above before running this script.
* There are several ways `gatk` can be run:
* Directly from the root of your git clone after building
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 5224fe5f7d6..5f1a2dc17ab 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -13,6 +13,7 @@ public final class GATKSVVCFConstants {
// VCF standard keys reserved for sv
public static final String SVTYPE = "SVTYPE";
public static final String SVLEN = "SVLEN";
+ public static final String EVIDENCE = "EVIDENCE";
public static final String IMPRECISE = "IMPRECISE";
public static final String CIPOS = "CIPOS";
public static final String CIEND = "CIEND";
@@ -31,6 +32,14 @@ public final class GATKSVVCFConstants {
public static final Allele DEL_ALLELE = Allele.create("", false);
public static final Allele DUP_ALLELE = Allele.create("", false);
+ // Evidence types
+ public enum EvidenceTypes {
+ BAF,
+ PE,
+ RD,
+ SR
+ }
+
// GATK-SV specific header lines
// TODO: 10/3/17 the following comment is a goal we are trying to achieve
// applicable to all records all the time
@@ -136,8 +145,13 @@ public enum ComplexVariantSubtype {
public static final String BND_DELETION_STRANDS = "+-";
public static final String BND_DUPLICATION_STRANDS = "-+";
+ // SR support
+ public static final String BOTHSIDES_SUPPORT_ATTRIBUTE = "BOTHSIDES_SUPPORT";
+ public static final String HIGH_SR_BACKGROUND_ATTRIBUTE = "HIGH_SR_BACKGROUND";
+
// format block
public static final String COPY_NUMBER_FORMAT = "CN";
+ public static final String DEPTH_GENOTYPE_COPY_NUMBER_FORMAT = "RD_CN";
public static final String EXPECTED_COPY_NUMBER_FORMAT = "ECN";
public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ";
@@ -175,6 +189,9 @@ public enum ComplexVariantSubtype {
public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN";
public static final String TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF";
+ // stratification
+ public static final String STRATUM_INFO_KEY = "STRAT";
+
// functional annotations
public static final String LOF = "PREDICTED_LOF";
public static final String INT_EXON_DUP = "PREDICTED_INTRAGENIC_EXON_DUP";
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 3f3258d6161..3b0466f4bd6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -21,6 +21,7 @@
import java.util.stream.Stream;
import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;
+import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT;
public class SVCallRecord implements SVLocatable {
@@ -31,6 +32,7 @@ public class SVCallRecord implements SVLocatable {
VCFConstants.END_KEY,
GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
GATKSVVCFConstants.SVLEN,
+ GATKSVVCFConstants.EVIDENCE,
GATKSVVCFConstants.CONTIG2_ATTRIBUTE,
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
@@ -48,6 +50,7 @@ public class SVCallRecord implements SVLocatable {
private final Boolean strandB;
private final GATKSVVCFConstants.StructuralVariantAnnotationType type;
private final Integer length;
+ private final List evidence;
private final List algorithms;
private final List alleles;
private final Allele refAllele;
@@ -72,6 +75,7 @@ public SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List cpxIntervals,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
final List genotypes,
@@ -79,7 +83,7 @@ public SVCallRecord(final String id,
final Set filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
- this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
+ this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, evidence, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}
@@ -94,6 +98,7 @@ protected SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List cpxIntervals,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
final List genotypes,
@@ -106,6 +111,7 @@ protected SVCallRecord(final String id,
Utils.nonNull(attributes);
Utils.nonNull(filters);
Utils.nonNull(cpxIntervals);
+ Utils.nonNull(evidence);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
@@ -123,6 +129,7 @@ protected SVCallRecord(final String id,
this.genotypes = GenotypesContext.copy(genotypes).immutable();
this.attributes = validateAttributes(attributes);
this.length = inferLength(type, positionA, positionB, length);
+ this.evidence = evidence;
final Pair strands = inferStrands(type, strandA, strandB);
this.strandA = strands.getLeft();
this.strandB = strands.getRight();
@@ -272,7 +279,8 @@ private boolean isCarrier(final Genotype genotype) {
}
// Otherwise, try to infer status if it's a biallelic CNV with a copy number call
- final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber);
+ final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT,
+ VariantContextGetters.getAttributeAsInt(genotype, DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, expectedCopyNumber));
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) {
return copyNumber < expectedCopyNumber;
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
@@ -370,6 +378,10 @@ public Integer getLength() {
return length;
}
+ public List getEvidence() {
+ return evidence;
+ }
+
public List getAlgorithms() {
return algorithms;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
index cf31d654727..4a13d62119e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
@@ -18,6 +18,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;
+import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4;
import static org.broadinstitute.hellbender.tools.sv.SVCallRecord.UNDEFINED_LENGTH;
public final class SVCallRecordUtils {
@@ -91,6 +92,9 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
+ if (!record.getEvidence().isEmpty()) {
+ builder.attribute(GATKSVVCFConstants.EVIDENCE, record.getEvidence());
+ }
if (!record.getFilters().isEmpty()) {
builder.filters(record.getFilters());
}
@@ -173,12 +177,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
*/
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}
@@ -291,10 +295,10 @@ public static Stream convertInversionsToBreakends(final SVCallReco
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
- record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
+ record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
- record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
+ record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}
@@ -319,8 +323,9 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
- final List cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
+ final List cpxIntervals = parseComplexIntervals(variant, dictionary);
final List algorithms = getAlgorithms(variant);
+ final List evidence = getEvidence(variant);
final String strands;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL
@@ -375,12 +380,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final Map sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
- cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
+ cpxIntervals, length, evidence, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
variant.getFilters(), log10PError);
}
- private static List parseComplexIntervals(final List intervals, final SAMSequenceDictionary dictionary) {
- return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
+ private static List parseComplexIntervals(final VariantContext variant, final SAMSequenceDictionary dictionary) {
+ return variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null).stream()
+ .map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
}
private static Map sanitizeAttributes(final Map attributes) {
@@ -402,6 +408,19 @@ private static Integer getLength(final VariantContext variant, final GATKSVVCFCo
return length;
}
+ public static List getEvidence(final VariantContext variant) {
+ Utils.nonNull(variant);
+ final List value = variant.getAttributeAsStringList(GATKSVVCFConstants.EVIDENCE, null);
+ if (value == null) {
+ return Collections.emptyList();
+ } else {
+ return value.stream()
+ .filter(v -> v != null && !v.equals(MISSING_VALUE_v4))
+ .map(GATKSVVCFConstants.EvidenceTypes::valueOf)
+ .collect(Collectors.toList());
+ }
+ }
+
public static List getAlgorithms(final VariantContext variant) {
Utils.nonNull(variant);
Utils.validateArg(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE), "Expected " + GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE + " field for variant " + variant.getID());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
index 39228617d26..ee6e140b793 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
@@ -23,6 +23,7 @@
import java.util.*;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
/**
* Class for collapsing a collection of similar {@link SVCallRecord} objects, such as clusters produced by
@@ -79,6 +80,32 @@ public enum AltAlleleSummaryStrategy {
}
+ /**
+ * Flag field logic
+ */
+ public enum FlagFieldLogic {
+ /**
+ * Require all members to have the flag set
+ */
+ AND,
+
+ /**
+ * Require at least one member to have the flag set
+ */
+ OR,
+
+ /**
+ * Always set to false
+ */
+ ALWAYS_FALSE
+
+ }
+
+ public static final Set FLAG_TYPE_INFO_FIELDS = Sets.newHashSet(
+ GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE,
+ GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE
+ );
+
private static final Set SUPPORTED_SV_TYPES = Sets.newHashSet(
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
@@ -90,33 +117,40 @@ public enum AltAlleleSummaryStrategy {
GATKSVVCFConstants.StructuralVariantAnnotationType.CTX
);
+ private static final BreakpointEvidenceComparator breakpointEvidenceComparator = new BreakpointEvidenceComparator();
+
/**
* Comparators used for picking the representative genotype for a given sample
*/
+ // Prioritize non-ref over ref
final Comparator genotypeIsNonRefComparator = (o1, o2) -> {
final long count1 = Math.min(1, o1.getAlleles().stream().filter(Allele::isNonReference).filter(Allele::isCalled).count());
final long count2 = Math.min(1, o2.getAlleles().stream().filter(Allele::isNonReference).filter(Allele::isCalled).count());
return Long.compare(count1, count2);
};
+ // Prioritize fewer ALT alleles over more. When applied after the non-ref comparator, hom-ref genotypes will not be encountered.
final Comparator genotypeNonRefCountComparator = (o1, o2) -> {
final long count1 = o1.getAlleles().stream().filter(Allele::isNonReference).filter(Allele::isCalled).count();
final long count2 = o2.getAlleles().stream().filter(Allele::isNonReference).filter(Allele::isCalled).count();
- return Long.compare(count1, count2);
+ return Long.compare(count2, count1);
};
+ // Prioritize called genotypes
final Comparator genotypeCalledComparator = (o1, o2) -> {
final long count1 = o1.getAlleles().stream().filter(Allele::isCalled).count();
final long count2 = o2.getAlleles().stream().filter(Allele::isCalled).count();
return Long.compare(count1, count2);
};
+ // Prioritize higher quality
final Comparator genotypeQualityComparator = (o1, o2) -> {
final int quality1 = VariantContextGetters.getAttributeAsInt(o1, VCFConstants.GENOTYPE_QUALITY_KEY, 0);
final int quality2 = VariantContextGetters.getAttributeAsInt(o2, VCFConstants.GENOTYPE_QUALITY_KEY, 0);
return Integer.compare(quality1, quality2);
};
+ // Prioritize higher depth genotyping quality
final Comparator genotypeCopyNumberQualityComparator = new Comparator() {
@Override
public int compare(Genotype o1, Genotype o2) {
@@ -126,6 +160,7 @@ public int compare(Genotype o1, Genotype o2) {
}
};
+ // Prioritize depth genotypes closer to reference
final Comparator genotypeCopyNumberComparator = new Comparator() {
@Override
public int compare(Genotype o1, Genotype o2) {
@@ -133,22 +168,43 @@ public int compare(Genotype o1, Genotype o2) {
final int copyNumber1 = VariantContextGetters.getAttributeAsInt(o1, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
final int expectedQualityNumber2 = VariantContextGetters.getAttributeAsInt(o2, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
final int copyNumber2 = VariantContextGetters.getAttributeAsInt(o2, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
- return Double.compare(Math.abs(expectedQualityNumber1 - copyNumber1), Math.abs(expectedQualityNumber2 - copyNumber2));
+ return Double.compare(Math.abs(expectedQualityNumber2 - copyNumber2), Math.abs(expectedQualityNumber1 - copyNumber1));
+ }
+ };
+
+ // Prioritize DEL over DUP as final tiebreaker
+ final Comparator genotypeDelOverDupComparator = new Comparator() {
+ @Override
+ public int compare(Genotype o1, Genotype o2) {
+ final int expectedCN1 = VariantContextGetters.getAttributeAsInt(o1, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
+ final boolean isDel1 = VariantContextGetters.getAttributeAsInt(o1, GATKSVVCFConstants.COPY_NUMBER_FORMAT, expectedCN1) < expectedCN1;
+ final int expectedCN2 = VariantContextGetters.getAttributeAsInt(o2, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
+ final boolean isDel2 = VariantContextGetters.getAttributeAsInt(o2, GATKSVVCFConstants.COPY_NUMBER_FORMAT, expectedCN2) < expectedCN2;
+ if (isDel1 && !isDel2) {
+ return 1;
+ } else if (isDel2 && !isDel1) {
+ return -1;
+ } else {
+ return 0;
+ }
}
};
private final AltAlleleSummaryStrategy altAlleleSummaryStrategy;
private final BreakpointSummaryStrategy breakpointSummaryStrategy;
+ private final FlagFieldLogic flagFieldLogic;
private final ReferenceSequenceFile reference;
private final SAMSequenceDictionary dictionary;
public CanonicalSVCollapser(final ReferenceSequenceFile reference,
final AltAlleleSummaryStrategy altAlleleSummaryStrategy,
- final BreakpointSummaryStrategy breakpointSummaryStrategy) {
+ final BreakpointSummaryStrategy breakpointSummaryStrategy,
+ final FlagFieldLogic flagFieldLogic) {
this.reference = Utils.nonNull(reference);
this.dictionary = reference.getSequenceDictionary();
this.altAlleleSummaryStrategy = altAlleleSummaryStrategy;
this.breakpointSummaryStrategy = breakpointSummaryStrategy;
+ this.flagFieldLogic = flagFieldLogic;
}
private static final int distance(final SVCallRecord item, final int newStart, final int newEnd) {
@@ -193,7 +249,7 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
end, strandB, type, representative.getComplexSubtype(), representative.getComplexEventIntervals(),
- length, algorithms, alleles, genotypes, attributes, filters, quality, dictionary);
+ length, representative.getEvidence(), algorithms, alleles, genotypes, attributes, filters, quality, dictionary);
}
protected List collapseAlleles(final List altAlleles, final Allele refAllele) {
@@ -429,7 +485,8 @@ protected Genotype getRepresentativeGenotype(final Collection genotype
.thenComparing(genotypeQualityComparator)
.thenComparing(genotypeNonRefCountComparator)
.thenComparing(genotypeCopyNumberQualityComparator)
- .thenComparing(genotypeCopyNumberComparator)).get();
+ .thenComparing(genotypeCopyNumberComparator)
+ .thenComparing(genotypeDelOverDupComparator)).get();
}
@@ -562,15 +619,37 @@ public static List makeBiallelicList(final Allele alt, final Allele ref,
return alleles;
}
+ private Stream getItemFlagStream(final String key, final Collection items) {
+ return items.stream()
+ .map(item ->item.getAttributes().get(key) != null && item.getAttributes().get(key).equals(Boolean.TRUE));
+ }
+
protected Map collapseAttributes(final SVCallRecord representative,
final Collection items) {
Utils.nonNull(items);
Utils.nonEmpty(items);
final Map attributes = new HashMap<>();
for (final Map.Entry entry : representative.getAttributes().entrySet()) {
- attributes.put(entry.getKey(), entry.getValue());
+ if (!FLAG_TYPE_INFO_FIELDS.contains(entry.getKey())) {
+ attributes.put(entry.getKey(), entry.getValue());
+ }
}
attributes.put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, items.stream().map(SVCallRecord::getId).sorted().collect(Collectors.toList()));
+ for (final String key : FLAG_TYPE_INFO_FIELDS) {
+ if (flagFieldLogic == FlagFieldLogic.AND) {
+ if (getItemFlagStream(key, items).allMatch(Boolean::booleanValue)) {
+ attributes.put(key, Boolean.TRUE);
+ }
+ } else if (flagFieldLogic == FlagFieldLogic.OR) {
+ if (getItemFlagStream(key, items).anyMatch(Boolean::booleanValue)) {
+ attributes.put(key, Boolean.TRUE);
+ }
+ } else if (flagFieldLogic == FlagFieldLogic.ALWAYS_FALSE) {
+ // Leave empty to imply FALSE
+ } else {
+ throw new IllegalArgumentException("Unsupported " + FlagFieldLogic.class.getSimpleName() + " value: " + flagFieldLogic.name());
+ }
+ }
return attributes;
}
@@ -671,16 +750,45 @@ private SVCallRecord getRepresentativeIntervalItem(final Collection qualityComparator = Comparator.comparing(r -> r.getLog10PError() == null ? 0 : r.getLog10PError());
final Comparator carrierCountComparator = Comparator.comparing(r -> -r.getCarrierGenotypeList().size());
final Comparator distanceComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends));
- final Comparator idComparator = Comparator.comparing(r -> getDistance(r.getPositionA(), r.getPositionB(), starts, ends)); // stabilizes order
+ final Comparator idComparator = Comparator.comparing(SVCallRecord::getId); // stabilizes order
return records.stream().min(
- carrierCountComparator
+ qualityComparator
+ .thenComparing(breakpointEvidenceComparator)
+ .thenComparing(carrierCountComparator)
.thenComparing(distanceComparator)
.thenComparing(idComparator)).get();
}
+ /**
+ * This class is for comparing evidence types for the purposes of breakpoint refinement. It prioritizes as follows:
+ * SR < PE < all other types. Note that SR is the "best" evidence but corresponds to the "least" value when sorting
+ * in ascending order.
+ */
+ protected static class BreakpointEvidenceComparator implements Comparator {
+ @Override
+ public int compare(final SVCallRecord a, final SVCallRecord b) {
+ final Set evidenceA = new HashSet<>(a.getEvidence());
+ final Set evidenceB = new HashSet<>(b.getEvidence());
+ // SR ranks before PE; records that agree on both SR and PE membership are considered equal
+ // Note sorting is in ascending order, and we want the highest-priority record first
+ if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) {
+ return -1;
+ } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.SR) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.SR)) {
+ return 1;
+ } else if (evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && !evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) {
+ return -1;
+ } else if (!evidenceA.contains(GATKSVVCFConstants.EvidenceTypes.PE) && evidenceB.contains(GATKSVVCFConstants.EvidenceTypes.PE)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ }
+
protected static long getDistance(final int posA,
final int posB,
final int[] starts,
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
index 0199eaae3e3..1100809f59d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java
@@ -23,8 +23,6 @@
*
* NOTE: precise implementation of {@link SVClusterLinkage#getMaxClusterableStartingPosition(SVLocatable)}
* is important for efficiency because it determines when a cluster can be finalized and omitted from further clustering tests.
- *
- * @param class of items to cluster
*/
public class SVClusterEngine {
@@ -41,7 +39,6 @@ public enum CLUSTERING_TYPE {
private Map idToClusterMap; // Active clusters
private final Map idToItemMap; // Active items
protected final CLUSTERING_TYPE clusteringType;
- private final ItemSortingBuffer buffer;
private final Comparator itemComparator;
private String currentContig;
@@ -65,30 +62,12 @@ public SVClusterEngine(final CLUSTERING_TYPE clusteringType,
currentContig = null;
idToItemMap = new HashMap<>();
itemComparator = SVCallRecordUtils.getSVLocatableComparator(dictionary);
- buffer = new ItemSortingBuffer();
nextItemId = 0;
nextClusterId = 0;
lastStart = 0;
minActiveStartingPositionItemId = null;
}
-
- /**
- * Flushes all active clusters, adding them to the output buffer. Results from the output buffer are then copied out
- * and the buffer is cleared. This should be called between contigs to save memory.
- */
- public final List forceFlush() {
- flushClusters();
- return buffer.forceFlush();
- }
-
- /**
- * Gets any available finalized clusters.
- */
- public final List flush() {
- return buffer.flush();
- }
-
@VisibleForTesting
public Function getCollapser() {
return collapser;
@@ -109,25 +88,26 @@ public SVCallRecord getMinActiveStartingPositionItem() {
* Returns true if there are any active or finalized clusters.
*/
public final boolean isEmpty() {
- return idToClusterMap.isEmpty() && buffer.isEmpty();
+ return idToClusterMap.isEmpty();
}
/**
* Adds and clusters the given item. Note that items must be added in order of increasing start position.
* @param item item to cluster
*/
- public final void add(final SVCallRecord item) {
+ public final List addAndFlush(final SVCallRecord item) {
// Start a new cluster if on a new contig
if (!item.getContigA().equals(currentContig)) {
- flushClusters();
+ final List result = flush();
currentContig = item.getContigA();
lastStart = 0;
seedCluster(registerItem(item));
- return;
+ return result;
+ } else {
+ final int itemId = registerItem(item);
+ final List clusterIdsToProcess = cluster(itemId);
+ return processClusters(clusterIdsToProcess);
}
- final int itemId = registerItem(item);
- final List clusterIdsToProcess = cluster(itemId);
- processClusters(clusterIdsToProcess);
}
private final int registerItem(final SVCallRecord item) {
@@ -263,12 +243,12 @@ private final void combineClusters(final Collection clusterIds, final I
/**
* Finalizes a single cluster, removing it from the currently active set and adding it to the output buffer.
*/
- private final void processCluster(final int clusterIndex) {
+ private final SVCallRecord processCluster(final int clusterIndex) {
final Cluster cluster = getCluster(clusterIndex);
idToClusterMap.remove(clusterIndex);
final List clusterItemIds = cluster.getItemIds();
final OutputCluster outputCluster = new OutputCluster(clusterItemIds.stream().map(idToItemMap::get).collect(Collectors.toList()));
- buffer.add(collapser.apply(outputCluster));
+ final SVCallRecord result = collapser.apply(outputCluster);
// Clean up item id map
if (clusterItemIds.size() == 1) {
// Singletons won't be present in any other clusters
@@ -289,6 +269,7 @@ private final void processCluster(final int clusterIndex) {
if (clusterItemIds.contains(minActiveStartingPositionItemId)) {
findAndSetMinActiveStart();
}
+ return result;
}
/**
@@ -309,25 +290,29 @@ private final void findAndSetMinActiveStart() {
/**
* Finalizes a set of clusters.
*/
- private final void processClusters(final List clusterIdsToProcess) {
+ private final List processClusters(final List clusterIdsToProcess) {
+ final List result = new ArrayList<>(clusterIdsToProcess.size());
for (final Integer clusterId : clusterIdsToProcess) {
- processCluster(clusterId);
+ result.add(processCluster(clusterId));
}
+ return result;
}
/**
* Finalizes all active clusters and adds them to the output buffer. Also clears the currently active set of clusters
* and items.
*/
- private final void flushClusters() {
+ public final List flush() {
final List clustersToFlush = new ArrayList<>(idToClusterMap.keySet());
+ final List result = new ArrayList<>(clustersToFlush.size());
for (final Integer clusterId : clustersToFlush) {
- processCluster(clusterId);
+ result.add(processCluster(clusterId));
}
idToItemMap.clear();
minActiveStartingPositionItemId = null;
nextItemId = 0;
nextClusterId = 0;
+ return result;
}
/**
@@ -431,52 +416,4 @@ public int hashCode() {
return Objects.hash(itemIds);
}
}
-
- private final class ItemSortingBuffer {
- private PriorityQueue buffer;
-
- public ItemSortingBuffer() {
- Utils.nonNull(itemComparator);
- this.buffer = new PriorityQueue<>(itemComparator);
- }
-
- public void add(final SVCallRecord record) {
- buffer.add(record);
- }
-
- /**
- * Returns any records that can be safely flushed based on the current minimum starting position
- * of items still being actively clustered.
- */
- public List flush() {
- if (buffer.isEmpty()) {
- return Collections.emptyList();
- }
- final SVCallRecord minActiveStartItem = getMinActiveStartingPositionItem();
- if (minActiveStartItem == null) {
- forceFlush();
- }
- final List out = new ArrayList<>();
- while (!buffer.isEmpty() && buffer.comparator().compare(buffer.peek(), minActiveStartItem) < 0) {
- out.add(buffer.poll());
- }
- return out;
- }
-
- /**
- * Returns all buffered records, regardless of any active clusters. To be used only when certain that no
- * active clusters can be clustered with any future inputs.
- */
- public List forceFlush() {
- final List result = new ArrayList<>(buffer.size());
- while (!buffer.isEmpty()) {
- result.add(buffer.poll());
- }
- return result;
- }
-
- public boolean isEmpty() {
- return buffer.isEmpty();
- }
- }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
index 8f3bcaf6112..525910cf2d8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineFactory.java
@@ -25,7 +25,7 @@ public static SVClusterEngine createCanonical(final SVClusterEngine.CLUSTERING_T
linkage.setDepthOnlyParams(depthParameters);
linkage.setMixedParams(mixedParameters);
linkage.setEvidenceParams(pesrParameters);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, breakpointSummaryStrategy, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(type, collapser::collapse, linkage, dictionary);
}
@@ -35,7 +35,7 @@ public static SVClusterEngine createCNVDefragmenter(final SAMSequenceDictionary
final double paddingFraction,
final double minSampleOverlap) {
final SVClusterLinkage linkage = new CNVLinkage(dictionary, paddingFraction, minSampleOverlap);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary);
}
@@ -46,7 +46,7 @@ public static SVClusterEngine createBinnedCNVDefragmenter(final SAMSequenceDicti
final double minSampleOverlap,
final List coverageIntervals) {
final SVClusterLinkage linkage = new BinnedCNVLinkage(dictionary, paddingFraction, minSampleOverlap, coverageIntervals);
- final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ final CanonicalSVCollapser collapser = new CanonicalSVCollapser(reference, altAlleleSummaryStrategy, CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, CanonicalSVCollapser.FlagFieldLogic.OR);
return new SVClusterEngine(SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, collapser::collapse, linkage, dictionary);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
index 433ec4ab46e..57293b6a95b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java
@@ -77,17 +77,17 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco
final Set samples = new HashSet<>(SVUtils.hashMapCapacity(genotypesA.size() + genotypesB.size()));
samples.addAll(genotypesA.getSampleNames());
samples.addAll(genotypesB.getSampleNames());
+ if (samples.isEmpty()) {
+ // Empty case considered perfect overlap
+ return true;
+ }
int numMatches = 0;
for (final String sample : samples) {
final Genotype genotypeA = genotypesA.get(sample);
final Genotype genotypeB = genotypesB.get(sample);
// If one sample doesn't exist in the other set, assume reference copy state
- final int cnA = genotypeA == null ?
- VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0)
- : VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
- final int cnB = genotypeB == null ?
- VariantContextGetters.getAttributeAsInt(genotypeA, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0)
- : VariantContextGetters.getAttributeAsInt(genotypeB, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
+ final int cnA = getCopyState(genotypeA, genotypeB);
+ final int cnB = getCopyState(genotypeB, genotypeA);
if (cnA == cnB) {
numMatches++;
}
@@ -105,4 +105,20 @@ protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallReco
}
}
+ /**
+ * Tries to get the best copy state from the genotype. If the genotype is null, uses ploidy (expected copy number) from a
+ * "backup" genotype as the default, and throws if both are null. If no value is found, just return -1 as a null default.
+ */
+ private static int getCopyState(final Genotype genotype, final Genotype matchedSampleGenotype) {
+ if (genotype == null) {
+ if (matchedSampleGenotype != null) {
+ return VariantContextGetters.getAttributeAsInt(matchedSampleGenotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1);
+ } else {
+ throw new IllegalArgumentException("Both genotypes are null");
+ }
+ } else {
+ return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT,
+ VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, -1));
+ }
+ }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java
new file mode 100644
index 00000000000..96db9939f03
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterWalker.java
@@ -0,0 +1,281 @@
+package org.broadinstitute.hellbender.tools.sv.cluster;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.util.SortingCollection;
+import htsjdk.variant.variantcontext.GenotypesContext;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
+import org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation;
+import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
+
+import java.util.Set;
+
+import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME;
+import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.FLAG_FIELD_LOGIC_LONG_NAME;
+
+/***
+ * Base class for tools that provide a simple interface for utilizing {@link SVClusterEngine}. It handles input/output easily,
+ * including output sorting with spilling to disk to avoid excessive memory usage.
+ */
+public abstract class SVClusterWalker extends MultiVariantWalker {
+ public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table";
+ public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix";
+ public static final String ENABLE_CNV_LONG_NAME = "enable-cnv";
+ public static final String ALGORITHM_LONG_NAME = "algorithm";
+ public static final String FAST_MODE_LONG_NAME = "fast-mode";
+ public static final String OMIT_MEMBERS_LONG_NAME = "omit-members";
+ public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call";
+ public static final String MAX_RECORDS_IN_RAM_LONG_NAME = "max-records-in-ram";
+
+ /**
+ * The enum Cluster algorithm.
+ */
+ public enum CLUSTER_ALGORITHM {
+ /**
+ * Defragment cnv cluster algorithm. Not supported with stratification.
+ */
+ DEFRAGMENT_CNV,
+ /**
+ * Single linkage cluster algorithm.
+ */
+ SINGLE_LINKAGE,
+ /**
+ * Max clique cluster algorithm.
+ */
+ MAX_CLIQUE
+ }
+
+ @Argument(
+ doc = "Output VCF",
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+ )
+ protected GATKPath outputFile;
+
+ /**
+ * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns
+ * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy
+ * integers in their respective columns.
+ */
+ @Argument(
+ doc = "Sample ploidy table (.tsv)",
+ fullName = PLOIDY_TABLE_LONG_NAME
+ )
+ protected GATKPath ploidyTablePath;
+
+ @Argument(
+ doc = "If supplied, generate variant IDs with this prefix",
+ fullName = VARIANT_PREFIX_LONG_NAME,
+ optional = true
+ )
+ protected String variantPrefix = null;
+
+ /**
+ * When enabled, DEL and DUP variants will be clustered together. The resulting records will have an SVTYPE of CNV.
+ */
+ @Argument(
+ doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)",
+ fullName = ENABLE_CNV_LONG_NAME,
+ optional = true
+ )
+ protected boolean enableCnv = false;
+
+ /**
+ * Results in substantial space and time savings for large sample sets by clearing genotypes that are not needed for
+ * clustering, but any associated annotation fields will be set to null in the output.
+ */
+ @Argument(
+ doc = "Fast mode. Drops hom-ref and missing genotype fields and emits them as missing.",
+ fullName = FAST_MODE_LONG_NAME,
+ optional = true
+ )
+ protected boolean fastMode = false;
+
+ @Argument(
+ doc = "Omit cluster member ID annotations",
+ fullName = OMIT_MEMBERS_LONG_NAME,
+ optional = true
+ )
+ protected boolean omitMembers = false;
+
+ @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME,
+ doc = "Strategy to use for choosing a representative value for a breakpoint cluster.",
+ optional = true)
+ protected CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy =
+ CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE;
+
+ @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME,
+ doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " +
+ "different subtypes.",
+ optional = true)
+ protected CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy =
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
+
+ @Argument(fullName = FLAG_FIELD_LOGIC_LONG_NAME,
+ doc = "Logic for collapsing Flag type INFO and FORMAT fields",
+ optional = true)
+ protected CanonicalSVCollapser.FlagFieldLogic flagFieldLogic = CanonicalSVCollapser.FlagFieldLogic.OR;
+
+ @Argument(fullName = ALGORITHM_LONG_NAME,
+ doc = "Clustering algorithm",
+ optional = true
+ )
+ protected CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE;
+
+ /**
+ * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different
+ * variants and samples are provided.
+ */
+ @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME,
+ doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" +
+ " available",
+ optional = true
+ )
+ protected boolean defaultNoCall = false;
+
+ @Argument(fullName = MAX_RECORDS_IN_RAM_LONG_NAME,
+ doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
+ "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
+ "VCF file, and increases the amount of RAM needed.",
+ optional=true)
+ public int maxRecordsInRam = 10000;
+
+ protected SAMSequenceDictionary dictionary;
+ protected ReferenceSequenceFile reference;
+ protected PloidyTable ploidyTable;
+ protected SortingCollection sortingBuffer;
+ protected VariantContextWriter writer;
+ protected VCFHeader header;
+ protected Set samples;
+ protected String currentContig;
+ protected int numVariantsBuilt = 0;
+
+ @Override
+ public boolean requiresReference() {
+ return true;
+ }
+
+ @Override
+ public void onTraversalStart() {
+ reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
+ dictionary = reference.getSequenceDictionary();
+ if (dictionary == null) {
+ throw new UserException("Reference sequence dictionary required");
+ }
+ ploidyTable = new PloidyTable(ploidyTablePath.toPath());
+ samples = getSamplesForVariants();
+ writer = createVCFWriter(outputFile);
+ header = createHeader();
+ writer.writeHeader(header);
+ currentContig = null;
+ sortingBuffer = SortingCollection.newInstance(
+ VariantContext.class,
+ new VCFRecordCodec(header, true),
+ header.getVCFRecordComparator(),
+ maxRecordsInRam,
+ tmpDir.toPath());
+ }
+
+ @Override
+ public Object onTraversalSuccess() {
+ for (final VariantContext variant : sortingBuffer) {
+ writer.add(variant);
+ }
+ return super.onTraversalSuccess();
+ }
+
+ @Override
+ public void closeTool() {
+ super.closeTool();
+ if (sortingBuffer != null) {
+ sortingBuffer.cleanup();
+ }
+ if (writer != null) {
+ writer.close();
+ }
+ }
+
+ /**
+ * Subclasses should override this method
+ */
+ public abstract void applyRecord(final SVCallRecord record);
+
+ @Override
+ public void apply(final VariantContext variant, final ReadsContext readsContext,
+ final ReferenceContext referenceContext, final FeatureContext featureContext) {
+ SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
+ if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
+ // Strip out non-carrier genotypes to save memory and compute
+ // Don't do for multi-allelic CNVs since carrier status can't be determined
+ final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList());
+ call = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes);
+ }
+ // Update current contig
+ if (!call.getContigA().equals(currentContig)) {
+ currentContig = call.getContigA();
+ logger.info("Processing contig " + currentContig + "...");
+ }
+ applyRecord(call);
+ }
+
+ protected VCFHeader createHeader() {
+ final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
+ header.setSequenceDictionary(dictionary);
+
+ // Required info lines
+ header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
+ header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
+ header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1,
+ VCFHeaderLineType.Integer, "Second position"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1,
+ VCFHeaderLineType.String, "Second contig"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1,
+ VCFHeaderLineType.String, "First and second strands"));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
+ VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms"));
+ if (!omitMembers) {
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY,
+ VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids"));
+ }
+ // Required format lines
+ header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
+ return header;
+ }
+
+ protected void write(final SVCallRecord call) {
+ sortingBuffer.add(buildVariantContext(call));
+ }
+
+ protected VariantContext buildVariantContext(final SVCallRecord call) {
+ // Add genotypes for missing samples
+ final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(
+ call, samples, !defaultNoCall, ploidyTable, header);
+
+ // Assign new variant ID
+ final String newId = variantPrefix == null ? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++);
+
+ // Build new variant
+ final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
+ call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
+ call.getComplexEventIntervals(), call.getLength(), call.getEvidence(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
+ call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
+ final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
+ if (omitMembers) {
+ builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
+ }
+ return builder.make();
+ }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
new file mode 100644
index 00000000000..2bc5720393b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/StratifiedClusteringTableParser.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.hellbender.tools.sv.cluster;
+
+import com.google.common.collect.ImmutableSet;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.util.Set;
+import java.util.function.Function;
+
+public class StratifiedClusteringTableParser {
+
+ // Configuration table column names
+ public static final String NAME_COLUMN = "NAME";
+ public static final String RECIPROCAL_OVERLAP_COLUMN = "RECIPROCAL_OVERLAP";
+ public static final String SIZE_SIMILARITY_COLUMN = "SIZE_SIMILARITY";
+ public static final String BREAKEND_WINDOW_COLUMN = "BREAKEND_WINDOW";
+ public static final String SAMPLE_OVERLAP_COLUMN = "SAMPLE_OVERLAP";
+ protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, RECIPROCAL_OVERLAP_COLUMN, SIZE_SIMILARITY_COLUMN, BREAKEND_WINDOW_COLUMN, SAMPLE_OVERLAP_COLUMN);
+
+ public static Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+ for (final String column : COLUMN_NAMES) { // every required column must be present
+ if (!columns.contains(column)) {
+ throw exceptionFactory.apply("Missing column " + column);
+ }
+ }
+ if (columns.columnCount() != COLUMN_NAMES.size()) { // reject tables with extra columns
+ throw exceptionFactory.apply("Expected " + COLUMN_NAMES.size() + " columns but found " + columns.columnCount());
+ }
+ return StratifiedClusteringTableParser::parseTableLine;
+ }
+
+ protected static StratumParameters parseTableLine(final DataLine dataLine) {
+ final String name = dataLine.get(NAME_COLUMN);
+ final double reciprocalOverlap = dataLine.getDouble(RECIPROCAL_OVERLAP_COLUMN);
+ final double sizeSimilarity = dataLine.getDouble(SIZE_SIMILARITY_COLUMN);
+ final double sampleOverlap = dataLine.getDouble(SAMPLE_OVERLAP_COLUMN);
+ final int breakendWindow = dataLine.getInt(BREAKEND_WINDOW_COLUMN);
+ return new StratumParameters(name, reciprocalOverlap, sizeSimilarity, breakendWindow, sampleOverlap);
+ }
+
+ public record StratumParameters(String name, double reciprocalOverlap, double sizeSimilarity,
+ int breakendWindow, double sampleOverlap) {
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
new file mode 100644
index 00000000000..9083aa83886
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStatificationEngine.java
@@ -0,0 +1,349 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.utils.IntervalMergingRule;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+// Groups variants by SVTYPE, SVLEN, and overlap with one or more interval sets
+public class SVStatificationEngine {
+
+ // Configuration table column names
+ public static final String NAME_COLUMN = "NAME";
+ public static final String SVTYPE_COLUMN = "SVTYPE";
+ public static final String MIN_SIZE_COLUMN = "MIN_SIZE";
+ public static final String MAX_SIZE_COLUMN = "MAX_SIZE";
+ public static final String TRACK_COLUMN = "TRACKS";
+ protected static final Set COLUMN_NAMES = ImmutableSet.of(NAME_COLUMN, SVTYPE_COLUMN, MIN_SIZE_COLUMN, MAX_SIZE_COLUMN, TRACK_COLUMN);
+ public static final String TRACK_COLUMN_DELIMITER = ",";
+
+ public static final Set NULL_TABLE_VALUES = Set.of("-1", "", "NULL", "NA");
+
+ protected final Map> trackMap;
+ protected final Map strata;
+ protected final SAMSequenceDictionary dictionary;
+
+ public SVStatificationEngine(final SAMSequenceDictionary dictionary) {
+ trackMap = new HashMap<>();
+ strata = new HashMap<>();
+ this.dictionary = Utils.nonNull(dictionary);
+ }
+
+ public void addTrack(final String name, final List intervals) {
+ Utils.nonNull(name);
+ Utils.nonNull(intervals);
+ Utils.validateArg(!trackMap.containsKey(name), "Track with name " + name + " already exists");
+ trackMap.put(name, OverlapDetector.create(intervals));
+ }
+
+ /**
+ * Adds a new stratification group
+ * @param name a unique ID
+ * @param svType SV type, may be null
+ * @param minSize minimum size in bp (inclusive), may be null
+ * @param maxSize maximum size in bp (exclusive), may be null
+ * @param trackNames reference track names
+ */
+ public void addStratification(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final Integer minSize, final Integer maxSize, final Set trackNames) {
+ addStratification(new Stratum(name, svType, minSize, maxSize, trackNames));
+ }
+
+ protected void addStratification(final Stratum stratification) {
+ if (strata.containsKey(stratification.getName())) {
+ throw new GATKException("Encountered duplicate name " + stratification.getName());
+ }
+ strata.put(stratification.getName(), stratification);
+ }
+
+ /**
+ * Retrieves intervals for the given track
+ * @param name track ID
+ * @return searchable interval set
+ */
+ public OverlapDetector getTrackIntervals(final String name) {
+ return trackMap.get(name);
+ }
+
+ /**
+ * Factory method for creating a new engine from a config file and set of reference tracks. The config file
+ * is a table parsable by {@link TableReader}, with mandatory columns defined in {@link #COLUMN_NAMES}.
+ * @param trackMap map from reference track name to interval set
+ * @param configFilePath path to stratification config table
+ * @param dictionary reference dict
+ * @return new engine
+ */
+ public static SVStatificationEngine create(final Map> trackMap,
+ final GATKPath configFilePath,
+ final SAMSequenceDictionary dictionary) {
+ Utils.nonNull(trackMap);
+ Utils.nonNull(configFilePath);
+ final SVStatificationEngine engine = new SVStatificationEngine(dictionary);
+ for (final Map.Entry> entry : trackMap.entrySet()) {
+ engine.addTrack(entry.getKey(), entry.getValue());
+ }
+ try (final TableReader tableReader = TableUtils.reader(configFilePath.toPath(), engine::tableParser)) {
+ for (final Stratum stratification : tableReader) {
+ engine.addStratification(stratification);
+ }
+ } catch (final IOException e) {
+ throw new GATKException("IO error while reading config table", e);
+ }
+ return engine;
+ }
+
+ /**
+ * Get all stratification groups matching a given query record.
+ * @param record query record
+ * @param overlapFraction minimum overlap fraction (0 to 1)
+ * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the reference track(s) (0, 1, 2)
+ * @param numBreakpointOverlapsInterchrom minimum breakpoint ends for interchromosomal variants (1, 2)
+ * @return all matching strata
+ */
+ public Collection getMatches(final SVCallRecord record, final double overlapFraction, final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+ Utils.nonNull(record);
+ final List result = new ArrayList<>();
+ for (final Stratum stratification : strata.values()) {
+ if (stratification.matches(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom)) {
+ result.add(stratification);
+ }
+ }
+ return result;
+ }
+
+ protected Function tableParser(TableColumnCollection columns, Function exceptionFactory) {
+ // Check for expected columns
+ for (final String column : COLUMN_NAMES) {
+ if (!columns.contains(column)) {
+ throw exceptionFactory.apply("Missing column " + column);
+ }
+ }
+ // Check there are no extra columns; the message reports the required count against the observed count
+ if (columns.columnCount() != COLUMN_NAMES.size()) {
+ throw exceptionFactory.apply("Expected " + COLUMN_NAMES.size() + " columns but found " + columns.columnCount());
+ }
+ return this::parseTableLine;
+ }
+
+ protected Stratum parseTableLine(final DataLine dataLine) {
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svType = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(dataLine.get(SVTYPE_COLUMN));
+ final String name = dataLine.get(NAME_COLUMN);
+ final Integer minSize = parseIntegerMaybeNull(dataLine.get(MIN_SIZE_COLUMN));
+ final Integer maxSize = parseIntegerMaybeNull(dataLine.get(MAX_SIZE_COLUMN));
+ final Set trackNames = parseTrackString(dataLine.get(TRACK_COLUMN));
+ return new Stratum(name, svType, minSize, maxSize, trackNames);
+ }
+
+ protected Set parseTrackString(final String val) {
+ if (NULL_TABLE_VALUES.contains(val)) {
+ return Collections.emptySet();
+ } else {
+ final String[] trackArray = val.split(TRACK_COLUMN_DELIMITER);
+ for (final String track : trackArray) {
+ if (!trackMap.containsKey(track)) {
+ throw new GATKException("Could not find track with name " + track);
+ }
+ }
+ return Lists.newArrayList(trackArray).stream().collect(Collectors.toUnmodifiableSet());
+ }
+ }
+
+ protected Integer parseIntegerMaybeNull(final String val) {
+ if (NULL_TABLE_VALUES.contains(val)) {
+ return null;
+ } else {
+ return Integer.valueOf(val);
+ }
+ }
+
+ public Collection getStrata() {
+ return strata.values();
+ }
+
+ public class Stratum {
+
+ final GATKSVVCFConstants.StructuralVariantAnnotationType svType;
+ final int minSize; // inclusive
+ final int maxSize; // exclusive
+ final List trackNames;
+ final String name;
+
+ Stratum(final String name, final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final Integer minSize, final Integer maxSize, final Set trackNames) {
+ this.name = Utils.nonNull(name);
+ for (final String trackName : trackNames) {
+ if (trackName != null && !trackMap.containsKey(trackName)) {
+ throw new IllegalArgumentException("Unregistered track name " + trackName);
+ }
+ }
+ if (maxSize != null && minSize != null && maxSize <= minSize) {
+ throw new IllegalArgumentException("Min size must be strictly less than max size");
+ }
+ if (maxSize != null && maxSize < 0) {
+ throw new IllegalArgumentException("Max size cannot be less than 0");
+ }
+ if (maxSize != null && maxSize == Integer.MAX_VALUE) { // MAX_VALUE is the internal "no upper bound" sentinel set below
+ throw new IllegalArgumentException("Max size " + Integer.MAX_VALUE + " is reserved");
+ }
+ if (minSize != null && minSize < 0) {
+ throw new IllegalArgumentException("Min size cannot be less than 0");
+ }
+ if ((svType == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || svType == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && (minSize != null || maxSize != null)) {
+ throw new IllegalArgumentException("BND/CTX categories cannot have min or max size (" + name + ")");
+ }
+ this.svType = svType;
+ // Map a null (unbounded) min to negative infinity
+ if (minSize == null) {
+ this.minSize = Integer.MIN_VALUE;
+ } else {
+ this.minSize = minSize;
+ }
+ // Map a null (unbounded) max to infinity
+ if (maxSize == null) {
+ this.maxSize = Integer.MAX_VALUE;
+ } else {
+ this.maxSize = maxSize;
+ }
+ this.trackNames = trackNames.stream().sorted().collect(Collectors.toList()); // stored in sorted order
+ }
+
+ protected boolean matches(final SVCallRecord record, final double overlapFraction,
+ final int numBreakpointOverlaps, final int numBreakpointOverlapsInterchrom) {
+ return matchesType(record) && matchesSize(record) && matchesTracks(record, overlapFraction, numBreakpointOverlaps, numBreakpointOverlapsInterchrom);
+ }
+
+ protected boolean matchesType(final SVCallRecord record) {
+ return record.getType() == svType;
+ }
+
+ protected boolean matchesSize(final SVCallRecord record) {
+ final Integer length = record.getLength();
+ if (length == null) {
+ // Undefined length requires null min/max boundaries
+ return minSize == Integer.MIN_VALUE && maxSize == Integer.MAX_VALUE;
+ } else {
+ return length >= minSize && length < maxSize;
+ }
+ }
+
+ /**
+ * Determines whether a given query record belongs to this track.
+ * @param record query record
+ * @param overlapFraction minimum variant interval overlap fraction
+ * @param numBreakpointOverlaps minimum number of breakpoint ends that must lie in the track
+ * @param numBreakpointOverlapsInterchrom minimum breakpoint ends if the variant is interchromosomal
+ * @return true if the SV matches the tracks of this stratum
+ */
+ public boolean matchesTracks(final SVCallRecord record,
+ final double overlapFraction,
+ final int numBreakpointOverlaps,
+ final int numBreakpointOverlapsInterchrom) {
+ Utils.nonNull(record);
+ Utils.validate(overlapFraction >= 0 && overlapFraction <= 1,
+ "Overlap fraction threshold " + overlapFraction + " must be on [0, 1]");
+ Utils.validate(numBreakpointOverlaps >= 0 && numBreakpointOverlaps <= 2,
+ "Breakpoint overlaps threshold " + numBreakpointOverlaps + " must be 0, 1, or 2");
+ Utils.validate(numBreakpointOverlapsInterchrom == 1 || numBreakpointOverlapsInterchrom == 2,
+ "Interchromosomal breakpoint overlaps threshold " + numBreakpointOverlapsInterchrom + " must be 1 or 2");
+ Utils.validate(!(overlapFraction == 0 && numBreakpointOverlaps == 0),
+ "Overlap fraction and overlapping breakpoints thresholds cannot both be 0");
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+ // Just require the insertion locus to fall in an interval
+ return matchesTrackBreakpointOverlap(record, 1);
+ } else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND || record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
+ // Interchromosomal variants
+ return matchesTrackBreakpointOverlap(record, numBreakpointOverlapsInterchrom);
+ } else {
+ return matchesTrackIntrachromosomal(record, overlapFraction, numBreakpointOverlaps);
+ }
+ }
+
+ protected boolean matchesTrackIntrachromosomal(final SVCallRecord record,
+ final double overlapFraction,
+ final int numBreakpointOverlaps) {
+ return matchesTrackOverlapFraction(record, overlapFraction) && matchesTrackBreakpointOverlap(record, numBreakpointOverlaps);
+ }
+
+ protected boolean matchesTrackOverlapFraction(final SVCallRecord record, final double overlapFraction) {
+ if (overlapFraction > 0 && !trackNames.isEmpty()) {
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ throw new GATKException("Track overlap for CPX types not currently supported (" + name + ")");
+ }
+ final SimpleInterval interval = new SimpleInterval(record.getContigA(), record.getPositionA(), record.getPositionB());
+ final List overlaps = new ArrayList<>();
+ for (final String track : trackNames) {
+ overlaps.addAll(trackMap.get(track).getOverlaps(interval).stream().map(SimpleInterval::new).collect(Collectors.toList()));
+ }
+ final List mergedOverlaps = IntervalUtils.sortAndMergeIntervals(overlaps, dictionary, IntervalMergingRule.ALL)
+ .values().stream().flatMap(List::stream).collect(Collectors.toList());
+ long overlapLength = 0;
+ for (final Locatable overlap : mergedOverlaps) {
+ overlapLength += interval.intersect(overlap).size();
+ }
+ return overlapLength / (double) interval.getLengthOnReference() >= overlapFraction;
+ } else {
+ return true;
+ }
+ }
+
+ protected boolean matchesTrackBreakpointOverlap(final SVCallRecord record, final int numBreakpointOverlaps) {
+ if (numBreakpointOverlaps > 0 && !trackNames.isEmpty()) {
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ throw new GATKException("Track overlap for CPX types not currently supported (" + name + ")");
+ }
+ final SimpleInterval intervalA = new SimpleInterval(record.getContigA(), record.getPositionA(), record.getPositionA());
+ final SimpleInterval intervalB = new SimpleInterval(record.getContigB(), record.getPositionB(), record.getPositionB());
+ return countAnyTrackOverlap(intervalA) + countAnyTrackOverlap(intervalB) >= numBreakpointOverlaps;
+ } else {
+ return true;
+ }
+ }
+
+ protected int countAnyTrackOverlap(final SimpleInterval interval) {
+ for (final String track : trackNames) {
+ if (trackMap.get(track).overlapsAny(interval)) {
+ return 1;
+ }
+ }
+ return 0;
+ }
+
+ public GATKSVVCFConstants.StructuralVariantAnnotationType getSvType() {
+ return svType;
+ }
+
+ public Integer getMinSize() {
+ return minSize;
+ }
+
+ public Integer getMaxSize() {
+ return maxSize;
+ }
+
+ public List getTrackNames() {
+ return trackNames;
+ }
+
+ public String getName() {
+ return name;
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
new file mode 100644
index 00000000000..2fe8ebaabe4
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineArgumentsCollection.java
@@ -0,0 +1,70 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Arguments for use with {@link SVStatificationEngine}.
+ */
+public class SVStratificationEngineArgumentsCollection implements Serializable {
+ // Command-line arguments
+ public static final String STRATIFY_CONFIG_FILE_LONG_NAME = "stratify-config";
+ public static final String TRACK_NAME_FILE_LONG_NAME = "track-name";
+ public static final String TRACK_INTERVAL_FILE_LONG_NAME = "track-intervals";
+ public static final String OVERLAP_FRACTION_LONG_NAME = "stratify-overlap-fraction";
+ public static final String NUM_BREAKPOINT_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps";
+ public static final String NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME = "stratify-num-breakpoint-overlaps-interchromosomal";
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Expected format is tab-delimited and contains columns NAME, SVTYPE, MIN_SIZE, MAX_SIZE, track. First line must
+ * be a header with column names. Comment lines starting with {@link TableUtils#COMMENT_PREFIX} are ignored.
+ */
+ @Argument(
+ doc = "Stratification configuration file (.tsv)",
+ fullName = STRATIFY_CONFIG_FILE_LONG_NAME
+ )
+ public GATKPath configFile;
+
+ @Argument(
+ doc = "Track intervals file. Can be specified multiple times.",
+ fullName = TRACK_INTERVAL_FILE_LONG_NAME,
+ optional = true
+ )
+ public List trackFileList;
+
+ @Argument(
+ doc = "Track names. Must be once for each --" + TRACK_INTERVAL_FILE_LONG_NAME,
+ fullName = TRACK_NAME_FILE_LONG_NAME,
+ optional = true
+ )
+ public List trackNameList;
+
+ @Argument(
+ doc = "Minimum overlap fraction for tracks",
+ minValue = 0,
+ maxValue = 1,
+ fullName = OVERLAP_FRACTION_LONG_NAME
+ )
+ public double overlapFraction = 0;
+
+ @Argument(
+ doc = "Minimum number of variant endpoint overlaps for tracks",
+ minValue = 0,
+ maxValue = 2,
+ fullName = NUM_BREAKPOINT_OVERLAPS_LONG_NAME
+ )
+ public int numBreakpointOverlaps = 1;
+
+ @Argument(
+ doc = "Minimum number of breakpoint overlaps for tracks for interchromosomal variants (e.g. BNDs)",
+ minValue = 1,
+ maxValue = 2,
+ fullName = NUM_BREAKPOINT_INTERCHROM_OVERLAPS_LONG_NAME
+ )
+ public int numBreakpointOverlapsInterchrom = 1;
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
new file mode 100644
index 00000000000..46b8569c379
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster.java
@@ -0,0 +1,243 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.vcf.VCFHeader;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.cluster.*;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.tsv.TableReader;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clusters structural variants using the same base algorithms as {@link SVCluster}. In addition, variants are
+ * grouped according to customizable stratification criteria including:
+ * <ul>
+ * <li>SV type</li>
+ * <li>Size range</li>
+ * <li>Reference track overlap</li>
+ * </ul>
+ * The first step is to define these groups in a stratification configuration TSV file. Please see the
+ * {@link SVStratify} tool for a description of the stratification method and expected table format.
+ *
+ * Each SV is only clustered with other SVs in its own group. Each group must be mutually exclusive, meaning that
+ * any given SV should only belong to one group. Furthermore, SVs that do not fall into any of the groups will not be
+ * clustered.
+ *
+ * The second step is to define the clustering configuration for each group. This is again done by creating a TSV
+ * file with the following columns defined on the first line:
+ * <ul>
+ * <li>NAME</li>
+ * <li>RECIPROCAL_OVERLAP</li>
+ * <li>SIZE_SIMILARITY</li>
+ * <li>BREAKEND_WINDOW</li>
+ * <li>SAMPLE_OVERLAP</li>
+ * </ul>
+ * where NAME corresponds to the same name given in the stratification configuration. Every group needs to be given
+ * a configuration here. That is, there should be a 1:1 correspondence of the rows in the two configuration files
+ * (order does not matter).
+ *
+ *
+ * The remaining columns define the clustering parameters for the group. See {@link SVCluster} for more information
+ * on the different parameters. Note that, unlike {@link SVCluster}, distinct parameter sets for depth-only,
+ * PESR, and "mixed" clustering cannot be defined for this tool. Instead, the same parameters are applied to
+ * all three cases.
+ *
+ * For example,
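+ * <table border="1">
+ * <tr>
+ * <th>NAME</th><th>RECIPROCAL_OVERLAP</th><th>SIZE_SIMILARITY</th><th>BREAKEND_WINDOW</th><th>SAMPLE_OVERLAP</th>
+ * </tr>
+ * <tr>
+ * <td>DEL_large_SD</td><td>0.3</td><td>0.5</td><td>1000000</td><td>0.1</td>
+ * </tr>
+ * <tr>
+ * <td>DUP_large_SD</td><td>0.3</td><td>0.5</td><td>1000000</td><td>0.1</td>
+ * </tr>
+ * <tr>
+ * <td>DEL_small_SR_RM</td><td>0.5</td><td>0.5</td><td>100</td><td>0.1</td>
+ * </tr>
+ * <tr>
+ * <td>DUP_small_SR_RM</td><td>0.5</td><td>0.5</td><td>100</td><td>0.1</td>
+ * </tr>
+ * <tr>
+ * <td>INS_SR</td><td>0.5</td><td>0.5</td><td>100</td><td>0</td>
+ * </tr>
+ * </table>
+ *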
+ * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
+ *
+ * This tool does not support CNV defragmentation via the {@link #algorithm} parameter.
+ *
+ * <h3>Inputs</h3>
+ *
+ * <ul>
+ * <li>
+ * One or more SV VCFs
+ * </li>
+ * <li>
+ * Stratification configuration TSV file
+ * </li>
+ * <li>
+ * Clustering configuration TSV file
+ * </li>
+ * <li>
+ * Reference fasta
+ * </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ *
+ * <ul>
+ * <li>
+ * Clustered VCF
+ * </li>
+ * </ul>
+ *
+ * <h3>Usage example</h3>
+ *
+ * <pre>
+ * gatk GroupedSVCluster \
+ * -R reference.fasta \
+ * -V variants.vcf.gz \
+ * -O clustered.vcf.gz \
+ * --track-name repeatmasker \
+ * --track-intervals repeatmasker.bed \
+ * --stratify-config strata.tsv \
+ * --clustering-config cluster.tsv
+ * </pre>
+ *
+ * @author Mark Walker &lt;markw@broadinstitute.org&gt;
+ */
+@CommandLineProgramProperties(
+ summary = "Clusters structural variants within independent stratification groups",
+ oneLineSummary = "Clusters structural variants grouping by type, size, and track overlap",
+ programGroup = StructuralVariantDiscoveryProgramGroup.class
+)
+@BetaFeature
+@DocumentedFeature
+public final class GroupedSVCluster extends SVClusterWalker {
+ public static final String CLUSTERING_CONFIG_FILE_LONG_NAME = "clustering-config";
+
+ @ArgumentCollection
+ private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();
+
+ /**
+  * Expected format is tab-delimited and contains columns NAME, RECIPROCAL_OVERLAP, SIZE_SIMILARITY, BREAKEND_WINDOW,
+  * SAMPLE_OVERLAP. First line must be a header with column names. Comment lines starting with
+  * {@link TableUtils#COMMENT_PREFIX} are ignored.
+  */
+ @Argument(doc = "Configuration file (.tsv) containing the clustering parameters for each group",
+         fullName = CLUSTERING_CONFIG_FILE_LONG_NAME)
+ public GATKPath strataClusteringConfigFile;
+
+ // Assigned in onTraversalStart() from the stratification arguments
+ private SVStatificationEngine stratEngine;
+ // One clustering engine per group, keyed by the NAME read from the clustering configuration
+ private final Map<String, SVClusterEngine> clusterEngineMap = new HashMap<>();
+
+ /** Loads the stratification and clustering configurations and verifies every stratum has a clustering engine. */
+ @Override
+ public void onTraversalStart() {
+     super.onTraversalStart();
+     // sorting not guaranteed. NOTE(review): set after super.onTraversalStart() — confirm the flag is still read afterwards
+     createOutputVariantIndex = false;
+     stratEngine = SVStratify.loadStratificationConfig(stratArgs, dictionary);
+     final boolean hasStrata = !stratEngine.getStrata().isEmpty();
+     Utils.validate(hasStrata, "No strata defined with --" + SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME);
+     readStrataClusteringConfig();
+     final boolean sameGroupCount = stratEngine.getStrata().size() == clusterEngineMap.size();
+     Utils.validate(sameGroupCount, "Stratification and clustering configurations have a different number of groups.");
+     // Equal counts alone do not prove the names line up, so each stratum is looked up individually
+     stratEngine.getStrata().forEach(stratum -> Utils.validate(clusterEngineMap.containsKey(stratum.getName()),
+             "Could not find group " + stratum.getName() + " in clustering configuration."));
+ }
+
+ @Override
+ protected VCFHeader createHeader() {
+     final VCFHeader vcfHeader = super.createHeader();
+     SVStratify.addStratifyMetadata(vcfHeader); // parent header plus whatever metadata SVStratify registers on it
+     return vcfHeader;
+ }
+
+ /** Registers one engine per config row (PESR, mixed and depth linkage share the row's parameters); a repeated NAME is rejected rather than silently replacing the earlier engine. */
+ private void readStrataClusteringConfig() {
+     try (final TableReader<StratifiedClusteringTableParser.StratumParameters> tableReader = TableUtils.reader(strataClusteringConfigFile.toPath(), StratifiedClusteringTableParser::tableParser)) {
+         for (final StratifiedClusteringTableParser.StratumParameters parameters : tableReader) {
+             Utils.validate(!clusterEngineMap.containsKey(parameters.name()), "Duplicate group " + parameters.name() + " in clustering configuration.");
+             final ClusteringParameters pesrParams = ClusteringParameters.createPesrParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+             final ClusteringParameters mixedParams = ClusteringParameters.createMixedParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+             final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(parameters.reciprocalOverlap(), parameters.sizeSimilarity(), parameters.breakendWindow(), parameters.sampleOverlap());
+             clusterEngineMap.put(parameters.name(), createClusteringEngine(pesrParams, mixedParams, depthParams));
+         }
+     } catch (final IOException e) {
+         throw new GATKException("IO error while reading config table", e);
+     }
+ }
+
+ /** Builds a canonical engine for SINGLE_LINKAGE or MAX_CLIQUE; throws IllegalArgumentException for any other algorithm. */
+ private SVClusterEngine createClusteringEngine(final ClusteringParameters pesrParams, final ClusteringParameters mixedParams, final ClusteringParameters depthParams) {
+     final boolean singleLinkage = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE;
+     if (!singleLinkage && algorithm != CLUSTER_ALGORITHM.MAX_CLIQUE) {
+         throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
+     }
+     final SVClusterEngine.CLUSTERING_TYPE linkageType = singleLinkage ? SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE;
+     // Arguments are passed as depth, mixed, PESR — the reverse of this method's own parameter order
+     return SVClusterEngineFactory.createCanonical(linkageType, breakpointSummaryStrategy, altAlleleSummaryStrategy,
+             dictionary, reference, enableCnv, depthParams, mixedParams, pesrParams);
+ }
+
+ /** Flushes every group's engine and writes the returned records before delegating to the parent. */
+ @Override
+ public Object onTraversalSuccess() {
+     clusterEngineMap.values()
+             .forEach(groupEngine -> groupEngine.flush().forEach(this::write));
+     return super.onTraversalSuccess();
+ }
+
+ @Override
+ public void closeTool() {
+ super.closeTool(); // nothing is added here; the override only forwards to the parent implementation
+ }
+
+ /** Sends the record to its single matching group's engine, writes it unclustered when no group matches, and fails on multiple matches. */
+ @Override
+ public void applyRecord(final SVCallRecord record) {
+     final Collection<SVStatificationEngine.Stratum> stratifications = stratEngine.getMatches(record,
+             stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
+     if (stratifications.size() > 1) {
+         // don't allow more than one match since it would proliferate variants
+         final String matchesString = stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.joining(", "));
+         throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString +
+                 ". Groups must be mutually exclusive. Please modify the group configurations and/or tracks so that " +
+                 "no variant can match more than one group.");
+     } else if (stratifications.isEmpty()) {
+         // no match, don't cluster
+         record.getAttributes().put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList(record.getId()));
+         record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(SVStratify.DEFAULT_STRATUM));
+         write(record);
+     } else { // exactly one match
+         final SVStatificationEngine.Stratum stratum = stratifications.iterator().next();
+         Utils.validate(clusterEngineMap.containsKey(stratum.getName()), "Group undefined: " + stratum.getName());
+         record.getAttributes().put(GATKSVVCFConstants.STRATUM_INFO_KEY, Collections.singletonList(stratum.getName()));
+         clusterAndWrite(record, clusterEngineMap.get(stratum.getName()));
+     }
+ }
+
+ private void clusterAndWrite(final SVCallRecord record, final SVClusterEngine clusterEngine) {
+     clusterEngine.addAndFlush(record).forEach(this::write); // write whatever the engine returns for this addition
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
index e447d2c1086..473d5a0f3f6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
@@ -105,6 +105,8 @@ public class JointGermlineCNVSegmentation extends MultiVariantWalkerGroupedOnSta
private SampleDB sampleDB;
private boolean isMultiSampleInput = false;
private ReferenceSequenceFile reference;
+ private Collection<SVCallRecord> defragmentBuffer; // defragmenter output held until it is sorted and fed to the cluster engine
+ private Collection<SVCallRecord> outputBuffer; // cluster engine output held until it is sorted and written
private final Set allosomalContigs = new LinkedHashSet<>(Arrays.asList("X","Y","chrX","chrY"));
class CopyNumberAndEndRecord {
@@ -132,6 +134,7 @@ public int getEndPosition() {
public static final String MODEL_CALL_INTERVALS_LONG_NAME = "model-call-intervals";
public static final String BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME = "breakpoint-summary-strategy";
public static final String ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME = "alt-allele-summary-strategy";
+ public static final String FLAG_FIELD_LOGIC_LONG_NAME = "flag-field-logic";
@Argument(fullName = MIN_QUALITY_LONG_NAME, doc = "Minimum QS score to combine a variant segment", optional = true)
private int minQS = 20;
@@ -200,6 +203,13 @@ public boolean requiresReference() {
// Cannot require sample overlap when clustering across samples
private static final double CLUSTER_SAMPLE_OVERLAP_FRACTION = 0;
+ @Argument(fullName = SVClusterWalker.MAX_RECORDS_IN_RAM_LONG_NAME,
+ doc = "When writing VCF files that need to be sorted, this will specify the number of records stored in " +
+ "RAM before spilling to disk. Increasing this number reduces the number of file handles needed to sort a " +
+ "VCF file, and increases the amount of RAM needed.",
+ optional=true)
+ public int maxRecordsInRam = 10000;
+
@Override
public void onTraversalStart() {
reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
@@ -223,6 +233,8 @@ public void onTraversalStart() {
clusterEngine = SVClusterEngineFactory.createCanonical(SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE, breakpointSummaryStrategy, altAlleleSummaryStrategy,
dictionary, reference, true, clusterArgs, CanonicalSVLinkage.DEFAULT_MIXED_PARAMS, CanonicalSVLinkage.DEFAULT_PESR_PARAMS);
+ defragmentBuffer = new ArrayList<>();
+ outputBuffer = new ArrayList<>();
vcfWriter = getVCFWriter();
if (getSamplesForVariants().size() != 1) {
@@ -285,14 +297,38 @@ public void apply(final List variantContexts, final ReferenceCon
final SVCallRecord record = createDepthOnlyFromGCNVWithOriginalGenotypes(vc, minQS, allosomalContigs, refAutosomalCopyNumber, sampleDB);
if (record != null) {
if (!isMultiSampleInput) {
- defragmenter.add(record);
+ bufferDefragmenterOutput(defragmenter.addAndFlush(record));
} else {
- clusterEngine.add(record);
+ bufferClusterOutput(clusterEngine.addAndFlush(record));
}
}
}
}
+ private void bufferDefragmenterOutput(final List<SVCallRecord> records) {
+     defragmentBuffer.addAll(records); // kept until flushDefragmenterBuffer() sorts and drains them
+ }
+
+ /** Returns the buffered defragmenter records sorted by positionA as an unmodifiable list, and starts a fresh buffer. */
+ private List<SVCallRecord> flushDefragmenterBuffer() {
+     final List<SVCallRecord> result = defragmentBuffer.stream()
+             .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)).collect(Collectors.toUnmodifiableList());
+     defragmentBuffer = new ArrayList<>();
+     return result;
+ }
+
+ private void bufferClusterOutput(final List<SVCallRecord> records) {
+     outputBuffer.addAll(records); // kept until flushClusterBuffer() sorts and drains them
+ }
+
+ /** Returns the buffered cluster records sorted by positionA as an unmodifiable list, and starts a fresh buffer. */
+ private List<SVCallRecord> flushClusterBuffer() {
+     final List<SVCallRecord> result = outputBuffer.stream()
+             .sorted(Comparator.comparingInt(SVCallRecord::getPositionA)).collect(Collectors.toUnmodifiableList());
+     outputBuffer = new ArrayList<>();
+     return result;
+ }
+
@Override
public Object onTraversalSuccess() {
processClusters();
@@ -305,11 +341,16 @@ public Object onTraversalSuccess() {
* new contig.
*/
private void processClusters() {
- final List defragmentedCalls = defragmenter.forceFlush();
- defragmentedCalls.stream().forEachOrdered(clusterEngine::add);
+ bufferDefragmenterOutput(defragmenter.flush());
//Jack and Isaac cluster first and then defragment
- final List clusteredCalls = clusterEngine.forceFlush();
- write(clusteredCalls);
+ bufferClusterOutput(
+ flushDefragmenterBuffer().stream()
+ .map(clusterEngine::addAndFlush)
+ .flatMap(List::stream)
+ .collect(Collectors.toList())
+ );
+ bufferClusterOutput(clusterEngine.flush());
+ write(flushClusterBuffer());
}
private VariantContext buildAndSanitizeRecord(final SVCallRecord record) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
index 791befb79fb..cd280857a63 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
@@ -1,32 +1,13 @@
package org.broadinstitute.hellbender.tools.walkers.sv;
-import htsjdk.samtools.SAMSequenceDictionary;
-import htsjdk.samtools.reference.ReferenceSequenceFile;
-import htsjdk.variant.variantcontext.GenotypesContext;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.variantcontext.VariantContextBuilder;
-import htsjdk.variant.variantcontext.writer.VariantContextWriter;
-import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
-import org.broadinstitute.hellbender.engine.*;
-import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
-import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
-import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.cluster.*;
-import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
-
-import java.util.List;
-import java.util.Set;
-
-import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME;
/**
* Clusters structural variants based on coordinates, event type, and supporting algorithms. Primary use cases include:
@@ -178,111 +159,10 @@
)
@BetaFeature
@DocumentedFeature
-public final class SVCluster extends MultiVariantWalker {
- public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table";
- public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix";
- public static final String ENABLE_CNV_LONG_NAME = "enable-cnv";
+public final class SVCluster extends SVClusterWalker {
+
public static final String DEFRAG_PADDING_FRACTION_LONG_NAME = "defrag-padding-fraction";
public static final String DEFRAG_SAMPLE_OVERLAP_LONG_NAME = "defrag-sample-overlap";
- public static final String CONVERT_INV_LONG_NAME = "convert-inv-to-bnd";
- public static final String ALGORITHM_LONG_NAME = "algorithm";
- public static final String FAST_MODE_LONG_NAME = "fast-mode";
- public static final String OMIT_MEMBERS_LONG_NAME = "omit-members";
- public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call";
-
- /**
- * The enum Cluster algorithm.
- */
- enum CLUSTER_ALGORITHM {
- /**
- * Defragment cnv cluster algorithm.
- */
- DEFRAGMENT_CNV,
- /**
- * Single linkage cluster algorithm.
- */
- SINGLE_LINKAGE,
- /**
- * Max clique cluster algorithm.
- */
- MAX_CLIQUE
- }
-
- @Argument(
- doc = "Output VCF",
- fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
- shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
- )
- private GATKPath outputFile;
-
- /**
- * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns
- * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy
- * integers in their respective columns.
- */
- @Argument(
- doc = "Sample ploidy table (.tsv)",
- fullName = PLOIDY_TABLE_LONG_NAME
- )
- private GATKPath ploidyTablePath;
-
- @Argument(
- doc = "If supplied, generate variant IDs with this prefix",
- fullName = VARIANT_PREFIX_LONG_NAME,
- optional = true
- )
- private String variantPrefix = null;
-
- /**
- * When enabled, DEL and DUP variants will be clustered together. The resulting records with have an SVTYPE of CNV.
- */
- @Argument(
- doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)",
- fullName = ENABLE_CNV_LONG_NAME,
- optional = true
- )
- private boolean enableCnv = false;
-
- /**
- * When enabled, INV records will be converted to a pairs of BNDs prior to clustering.
- */
- @Argument(
- doc = "Convert inversions to BND records",
- fullName = CONVERT_INV_LONG_NAME,
- optional = true
- )
- private boolean convertInversions = false;
-
- /**
- * Results in substantial space and time costs for large sample sets by clearing genotypes that are not needed for
- * clustering, but any associated annotation fields will be set to null in the output.
- */
- @Argument(
- doc = "Fast mode. Drops hom-ref and no-call genotype fields and emits them as no-calls.",
- fullName = FAST_MODE_LONG_NAME,
- optional = true
- )
- private boolean fastMode = false;
-
- @Argument(
- doc = "Omit cluster member ID annotations",
- fullName = OMIT_MEMBERS_LONG_NAME,
- optional = true
- )
- private boolean omitMembers = false;
-
- @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME,
- doc = "Strategy to use for choosing a representative value for a breakpoint cluster.",
- optional = true)
- private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy =
- CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE;
-
- @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME,
- doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " +
- "different subtypes.",
- optional = true)
- private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy =
- CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE;
@Argument(fullName = DEFRAG_PADDING_FRACTION_LONG_NAME,
doc = "Padding as a fraction of variant length for CNV defragmentation mode.",
@@ -296,51 +176,14 @@ enum CLUSTER_ALGORITHM {
)
private double defragSampleOverlapFraction = CNVLinkage.DEFAULT_SAMPLE_OVERLAP;
- @Argument(fullName = ALGORITHM_LONG_NAME,
- doc = "Clustering algorithm",
- optional = true
- )
- private CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE;
-
- /**
- * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different
- * variants and samples are provided.
- */
- @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME,
- doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" +
- " available",
- optional = true
- )
- private boolean defaultNoCall = false;
-
@ArgumentCollection
private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection();
- private SAMSequenceDictionary dictionary;
- private ReferenceSequenceFile reference;
- private PloidyTable ploidyTable;
- private VariantContextWriter writer;
- private VCFHeader header;
- private SVClusterEngine clusterEngine;
- private Set samples;
- private String currentContig;
- private int numVariantsBuilt = 0;
-
- @Override
- public boolean requiresReference() {
- return true;
- }
+ protected SVClusterEngine clusterEngine;
@Override
public void onTraversalStart() {
- reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
- dictionary = reference.getSequenceDictionary();
- if (dictionary == null) {
- throw new UserException("Reference sequence dictionary required");
- }
- ploidyTable = new PloidyTable(ploidyTablePath.toPath());
- samples = getSamplesForVariants();
-
+ super.onTraversalStart();
if (algorithm == CLUSTER_ALGORITHM.DEFRAGMENT_CNV) {
clusterEngine = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy,
reference, defragPaddingFraction, defragSampleOverlapFraction);
@@ -354,107 +197,21 @@ public void onTraversalStart() {
} else {
throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name());
}
-
- writer = createVCFWriter(outputFile);
- header = createHeader();
- writer.writeHeader(header);
- currentContig = null;
}
@Override
public Object onTraversalSuccess() {
- write(true);
+ clusterEngine.flush().stream().forEach(this::write);
return super.onTraversalSuccess();
}
@Override
public void closeTool() {
super.closeTool();
- if (writer != null) {
- writer.close();
- }
}
@Override
- public void apply(final VariantContext variant, final ReadsContext readsContext,
- final ReferenceContext referenceContext, final FeatureContext featureContext) {
- final SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
- final SVCallRecord filteredCall;
- if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
- // Strip out non-carrier genotypes to save memory and compute
- // Don't do for multi-allelic CNVs since carrier status can't be determined
- final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList());
- filteredCall = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes);
- } else {
- filteredCall = call;
- }
-
- // Update current contig
- if (!filteredCall.getContigA().equals(currentContig)) {
- currentContig = filteredCall.getContigA();
- logger.info("Processing contig " + currentContig + "...");
- }
-
- // Add to clustering buffer
- if (convertInversions) {
- SVCallRecordUtils.convertInversionsToBreakends(filteredCall, dictionary).forEachOrdered(clusterEngine::add);
- } else {
- clusterEngine.add(filteredCall);
- }
-
- write(false);
- }
-
- private void write(final boolean force) {
- final List records = force ? clusterEngine.forceFlush() : clusterEngine.flush();
- records.stream().map(this::buildVariantContext).forEachOrdered(writer::add);
+ public void applyRecord(final SVCallRecord record) {
+ clusterEngine.addAndFlush(record).stream().forEach(this::write);
}
-
- private VCFHeader createHeader() {
- final VCFHeader header = new VCFHeader(getHeaderForVariants().getMetaDataInInputOrder(), samples);
- header.setSequenceDictionary(dictionary);
-
- // Required info lines
- header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
- header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN));
- header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1,
- VCFHeaderLineType.Integer, "Second position"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1,
- VCFHeaderLineType.String, "Second contig"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1,
- VCFHeaderLineType.String, "First and second strands"));
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
- VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms"));
- if (!omitMembers) {
- header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY,
- VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids"));
- }
-
- // Required format lines
- header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
-
- return header;
- }
-
- public VariantContext buildVariantContext(final SVCallRecord call) {
- // Add genotypes for missing samples
- final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(
- call, samples, !defaultNoCall, ploidyTable, header);
-
- // Assign new variant ID
- final String newId = variantPrefix == null ? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++);
-
- // Build new variant
- final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
- call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
- call.getComplexEventIntervals(), call.getLength(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
- call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
- final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
- if (omitMembers) {
- builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
- }
- return builder.make();
- }
-
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
index 56c389539c6..18f405cef06 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
@@ -199,7 +199,7 @@ protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) {
final List genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList());
return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(),
item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(),
- item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getAlgorithms(),
+ item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getEvidence(), item.getAlgorithms(),
item.getAlleles(), genotypes, item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java
new file mode 100644
index 00000000000..0dc560c90ca
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify.java
@@ -0,0 +1,330 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.VariantContextBuilder;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFHeaderLineType;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStatificationEngine;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.broadinstitute.hellbender.utils.*;
+
+import java.io.File;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Stratifies structural variants into mutually exclusive groups according to the following customizable criteria:
+ *
+ * - SV type
+ * - Size range
+ * - Reference track overlap
+ *
+ * Records are annotated with their respective strata names in the {@link GATKSVVCFConstants#STRATUM_INFO_KEY} INFO
+ * field. Users must provide a stratification configuration .tsv file (tab-delimited table) with the following column
+ * header on the first line:
+ *
+ * - NAME
+ * - SVTYPE
+ * - MIN_SIZE
+ * - MAX_SIZE
+ * - TRACKS
+ *
+ *
+ * For example:
+ *
+ *
+ * NAME | SVTYPE | MIN_SIZE | MAX_SIZE | TRACKS |
+ *
+ *
+ * DEL_large_SD | DEL | 5000 | -1 | SD |
+ *
+ *
+ * DUP_large_SD | DUP | 5000 | -1 | SD |
+ *
+ *
+ * DEL_small_SR_RM | DEL | -1 | 5000 | SR,RM |
+ *
+ *
+ * DUP_small_SR_RM | DUP | -1 | 5000 | SR,RM |
+ *
+ *
+ * INS_SR | INS | -1 | -1 | SR |
+ *
+ *
+ *
+ * The "NAME" column is an arbitrary identifier, "SVTYPE" is the class of variant (DEL, DUP, INS, etc.), MIN_SIZE is an
+ * inclusive size lower-bound, MAX_SIZE is an exclusive size upper-bound, and TRACKS is a comma-delimited list of
+ * reference tracks defined using the {@link SVStratificationEngineArgumentsCollection#trackFileList} and
+ * {@link SVStratificationEngineArgumentsCollection#trackNameList} parameters. For example,
+ *
+ *
+ * gatk SVStratify \
+ * --track-name RM \
+ * --track-intervals repeatmasker.bed \
+ * --track-name SD \
+ * --track-intervals segmental_duplications.bed \
+ * --track-name SR \
+ * --track-intervals simple_repeats.bed \
+ * ...
+ *
+ *
+ * The MIN_SIZE, MAX_SIZE, and TRACKS columns may contain null values {"-1", "", "NULL", "NA"}. Null MIN_SIZE and
+ * MAX_SIZE correspond to negative and positive infinity, respectively, and a null TRACKS value means that variants
+ * will not be matched based on track. Variants with undefined SVLEN will only match if both MIN_SIZE and MAX_SIZE
+ * are null.
+ *
+ *
+ * The {@link SVStratificationEngineArgumentsCollection#overlapFraction},
+ * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps}, and
+ * {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom} can be used to modify the overlap
+ * criteria for assigning variants to each group based on overlap with the given reference track intervals. By
+ * default, only one endpoint of the variant needs to lie in a track interval in order to match. INS variants are
+ * treated as single points and only {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlaps} is used,
+ * ignoring {@link SVStratificationEngineArgumentsCollection#overlapFraction}. Similarly, CTX and BND variant
+ * overlap is only defined by {@link SVStratificationEngineArgumentsCollection#numBreakpointOverlapsInterchrom}.
+ *
+ *
+ * By default, each stratification group must be mutually exclusive, meaning that any given SV can only belong to
+ * one group. An error is thrown if the tool encounters a variant that meets the criteria for more than one group.
+ * This restriction can be overridden with the {@link SVStratify#ALLOW_MULTIPLE_MATCHES_LONG_NAME} argument, in which
+ * case the record will be written out multiple times: once for each matching stratification group with the corresponding
+ * {@link GATKSVVCFConstants#STRATUM_INFO_KEY} value. Furthermore, SVs that do not match any of the groups will be
+ * annotated with the {@link SVStratify#DEFAULT_STRATUM} group.
+ *
+ * If using {@link #SPLIT_OUTPUT_LONG_NAME} then the tool generates a set of VCFs as output with each VCF containing
+ * the records of each group.
+ *
+ * This tool accepts multiple VCF inputs with no restrictions on site or sample overlap.
+ *
+ * Inputs
+ *
+ *
+ * -
+ * One or more SV VCFs
+ *
+ * -
+ * Stratification configuration TSV file
+ *
+ * -
+ * Reference dictionary
+ *
+ *
+ *
+ * Output
+ *
+ *
+ * -
+ * Annotated VCF(s)
+ *
+ *
+ *
+ * Usage example, generating stratified VCFs:
+ *
+ *
+ * gatk SVStratify \
+ * -V variants.vcf.gz \
+ * --split-output \
+ * -O ./ \
+ * --output-prefix out \
+ * --sequence-dictionary reference.dict \
+ * --track-name RM \
+ * --track-intervals repeatmasker.bed \
+ * --stratify-config strata.tsv
+ *
+ *
+ * Usage example, a single annotated VCF:
+ *
+ *
+ * gatk SVStratify \
+ * -V variants.vcf.gz \
+ * -O out.vcf.gz \
+ * --sequence-dictionary reference.dict \
+ * --track-name RM \
+ * --track-intervals repeatmasker.bed \
+ * --stratify-config strata.tsv
+ *
+ *
+ * @author Mark Walker <markw@broadinstitute.org>
+ */
+@CommandLineProgramProperties(
+ summary = "Annotates variants by SV type, size, and reference tracks",
+ oneLineSummary = "Annotates variants by SV type, size, and reference tracks",
+ programGroup = StructuralVariantDiscoveryProgramGroup.class
+)
+@BetaFeature
+@DocumentedFeature
+public final class SVStratify extends MultiVariantWalker {
+
+ // Long names of the two boolean command-line flags declared further down in this class
+ public static final String ALLOW_MULTIPLE_MATCHES_LONG_NAME = "allow-multiple-matches";
+ public static final String SPLIT_OUTPUT_LONG_NAME = "split-output";
+
+ // Default output group name for unmatched records. It is also the key of the only writer when output is
+ // not split, and loadStratificationConfig() rejects any configured stratum that uses this name.
+ public static final String DEFAULT_STRATUM = "default";
+
+ // Single output VCF, or the directory receiving one VCF per stratum when splitOutput is set
+ @Argument(
+ doc = "Output path. Must be a directory if using --" + SPLIT_OUTPUT_LONG_NAME,
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+ )
+ private GATKPath outputPath;
+
+ // Leading component of each per-stratum filename: <prefix>.<stratum>.vcf.gz (see generateGroupOutputPath)
+ @Argument(
+ doc = "Prefix for output filenames, only if using --" + SPLIT_OUTPUT_LONG_NAME,
+ fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME,
+ optional = true
+ )
+ private String outputPrefix;
+
+ // Track names/files, config file and overlap thresholds read by loadStratificationConfig() and apply()
+ @ArgumentCollection
+ private final SVStratificationEngineArgumentsCollection stratArgs = new SVStratificationEngineArgumentsCollection();
+
+ // When false, apply() throws if a record matches more than one stratum
+ @Argument(
+ doc = "Do not enforce mutual exclusivity for each stratification group",
+ fullName = ALLOW_MULTIPLE_MATCHES_LONG_NAME
+ )
+ private boolean allowMultipleMatches = false;
+
+ // When true, initializeWriters() opens one writer per stratum plus one for DEFAULT_STRATUM
+ @Argument(
+ doc = "Split output into multiple VCFs, one per stratification group. If used, then --" +
+ StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be the output directory and --" +
+ CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME + " must be provided.",
+ fullName = SPLIT_OUTPUT_LONG_NAME
+ )
+ private boolean splitOutput = false;
+
+    // Sequence dictionary obtained from getMasterSequenceDictionary() in onTraversalStart()
+    protected SAMSequenceDictionary dictionary;
+    // Open VCF writers keyed by stratum name; contains only DEFAULT_STRATUM when output is not split
+    protected Map<String, VariantContextWriter> writers;
+    // Stratification engine built from the track and configuration arguments in onTraversalStart()
+    protected SVStatificationEngine engine;
+
+    /**
+     * Resolves the master sequence dictionary, builds the stratification engine from the tool arguments,
+     * logs every loaded stratum at debug level, and finally opens the output writer(s).
+     */
+    @Override
+    public void onTraversalStart() {
+        super.onTraversalStart();
+
+        dictionary = getMasterSequenceDictionary();
+        final String missingDictionaryMessage = "Reference dictionary is required; please specify with --" +
+                StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME;
+        Utils.validateArg(dictionary != null, missingDictionaryMessage);
+
+        engine = loadStratificationConfig(stratArgs, dictionary);
+        logger.debug("Loaded stratification groups:");
+        engine.getStrata().forEach(stratum -> logger.debug(stratum));
+
+        initializeWriters();
+    }
+
+    /**
+     * Opens a VCF writer for one stratification group, registers it under the group name, and writes its header.
+     * The name is checked before anything is opened, and the writer is registered before the header is written,
+     * so a failure cannot leave behind a writer that {@link #closeTool()} does not know about.
+     *
+     * @param name stratum name used as the key into {@link #writers}
+     * @param path destination of this group's VCF
+     * @throws GATKException.ShouldNeverReachHereException if a writer is already registered under {@code name}
+     */
+    protected void createGroupWriter(final String name, final Path path) {
+        if (writers.containsKey(name)) {
+            throw new GATKException.ShouldNeverReachHereException("Stratification name already exists: " + name);
+        }
+        // Input header plus the INFO line describing the stratum annotation
+        final VCFHeader header = new VCFHeader(getHeaderForVariants());
+        addStratifyMetadata(header);
+        final VariantContextWriter writer = createVCFWriter(path);
+        writers.put(name, writer);
+        writer.writeHeader(header);
+    }
+
+    /**
+     * Adds the INFO header line for {@link GATKSVVCFConstants#STRATUM_INFO_KEY} (a single String value) to the
+     * given header.
+     *
+     * @param header header to modify in place
+     */
+    public static void addStratifyMetadata(final VCFHeader header) {
+        final VCFInfoHeaderLine stratumInfoLine = new VCFInfoHeaderLine(
+                GATKSVVCFConstants.STRATUM_INFO_KEY, 1, VCFHeaderLineType.String, "Stratum ID");
+        header.addMetaDataLine(stratumInfoLine);
+    }
+
+    /**
+     * Builds the per-group output location {@code <outputPath>/<outputPrefix>.<name>.vcf.gz}.
+     *
+     * @param name stratum name to embed in the filename
+     * @return the filename resolved against the output path
+     */
+    protected Path generateGroupOutputPath(final String name) {
+        final Path outputDirectory = outputPath.toPath();
+        return outputDirectory.resolve(outputPrefix + "." + name + ".vcf.gz");
+    }
+
+    /**
+     * Creates the output writer(s). Without split output a single writer is registered under
+     * {@link #DEFAULT_STRATUM} and receives every record. With split output, one writer is opened for
+     * {@link #DEFAULT_STRATUM} and one for each configured stratum, all inside the output directory.
+     */
+    protected void initializeWriters() {
+        writers = new HashMap<>();
+        if (!splitOutput) {
+            createGroupWriter(DEFAULT_STRATUM, outputPath.toPath());
+            return;
+        }
+        Utils.validateArg(outputPrefix != null, "Argument --" + CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME +
+                " required if using --" + SPLIT_OUTPUT_LONG_NAME);
+        // Flag name is rendered with its leading "--", matching the other messages in this tool
+        Utils.validateArg(new File(outputPath.toString()).isDirectory(), "Argument --" +
+                StandardArgumentDefinitions.OUTPUT_LONG_NAME + " must be a directory if using --" + SPLIT_OUTPUT_LONG_NAME);
+        createGroupWriter(DEFAULT_STRATUM, generateGroupOutputPath(DEFAULT_STRATUM));
+        for (final SVStatificationEngine.Stratum s : engine.getStrata()) {
+            createGroupWriter(s.getName(), generateGroupOutputPath(s.getName()));
+        }
+    }
+
+    /**
+     * Reusable method for loading the stratification configuration table. See tool doc for the expected format.
+     *
+     * Track names and track interval files are paired by position. Each file is loaded with the UNION interval
+     * set rule and the ALL merging rule, and stored as an unmodifiable list under its track name.
+     *
+     * @param args       argument collection supplying the track names, track files and configuration file
+     * @param dictionary sequence dictionary used to parse the track intervals and to build the engine
+     * @return engine holding the configured strata
+     * @throws UserException.BadInput if a track name is repeated, or if the configuration defines a stratum
+     *                                named {@link #DEFAULT_STRATUM}
+     */
+    public static SVStatificationEngine loadStratificationConfig(final SVStratificationEngineArgumentsCollection args,
+                                                                 final SAMSequenceDictionary dictionary) {
+        Utils.validateArg(args.trackNameList.size() == args.trackFileList.size(), "Arguments --" +
+                SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME + " and --" + SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME +
+                " must be specified the same number of times.");
+        final Map<String, List<Locatable>> map = new HashMap<>();
+        final Iterator<String> nameIterator = args.trackNameList.iterator();
+        final Iterator<GATKPath> pathIterator = args.trackFileList.iterator();
+        final GenomeLocParser genomeLocParser = new GenomeLocParser(dictionary);
+        while (nameIterator.hasNext() && pathIterator.hasNext()) {
+            final String name = nameIterator.next();
+            final GATKPath path = pathIterator.next();
+            // Reject a repeated name before spending time reading its interval file
+            if (map.containsKey(name)) {
+                throw new UserException.BadInput("Duplicate track name was specified: " + name);
+            }
+            final GenomeLocSortedSet genomeLocs = IntervalUtils.loadIntervals(Collections.singletonList(path.toString()), IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser);
+            final List<Locatable> intervals = Collections.unmodifiableList(genomeLocs.toList());
+            map.put(name, intervals);
+        }
+        final SVStatificationEngine engine = SVStatificationEngine.create(map, args.configFile, dictionary);
+        // DEFAULT_STRATUM is reserved for records that match no configured stratum
+        if (engine.getStrata().stream().anyMatch(s -> s.getName().equals(DEFAULT_STRATUM))) {
+            throw new UserException.BadInput("Stratification configuration contains entry with reserved " +
+                    "ID \"" + DEFAULT_STRATUM + "\"");
+        }
+        return engine;
+    }
+
+    /**
+     * Closes every open group writer, then delegates to the superclass.
+     */
+    @Override
+    public void closeTool() {
+        // writers is only assigned in initializeWriters(); if start-up failed earlier it is still null, and
+        // dereferencing it here would replace the original error with a NullPointerException
+        if (writers != null) {
+            writers.values().forEach(VariantContextWriter::close);
+        }
+        super.closeTool();
+    }
+
+ @Override
+ public void apply(final VariantContext variant, final ReadsContext readsContext,
+ final ReferenceContext referenceContext, final FeatureContext featureContext) {
+ // Save a ton of compute by not copying genotypes into the new record
+ final VariantContext variantNoGenotypes = new VariantContextBuilder(variant).genotypes(Collections.emptyList()).make();
+ final SVCallRecord record = SVCallRecordUtils.create(variantNoGenotypes, dictionary);
+ final Collection stratifications = engine.getMatches(record,
+ stratArgs.overlapFraction, stratArgs.numBreakpointOverlaps, stratArgs.numBreakpointOverlapsInterchrom);
+ final VariantContextBuilder builder = new VariantContextBuilder(variant);
+ if (stratifications.isEmpty()) {
+ writers.get(DEFAULT_STRATUM).add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, DEFAULT_STRATUM).make());
+ } else {
+ if (!allowMultipleMatches && stratifications.size() > 1) {
+ final String matchesString = String.join(", ", stratifications.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList()));
+ throw new GATKException("Record " + record.getId() + " matched multiple groups: " + matchesString + ". Bypass this error using the --" + ALLOW_MULTIPLE_MATCHES_LONG_NAME + " argument");
+ }
+ for (final SVStatificationEngine.Stratum stratum : stratifications) {
+ final VariantContextWriter writer = splitOutput ? writers.get(stratum.getName()) : writers.get(DEFAULT_STRATUM);
+ if (writer == null) {
+ throw new GATKException("Writer not found for group: " + stratum.getName());
+ }
+ writer.add(builder.attribute(GATKSVVCFConstants.STRATUM_INFO_KEY, stratum.getName()).make());
+ }
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java
index 5b2b44a3f50..021c2d5cccb 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java
@@ -1118,7 +1118,7 @@ private void createVisualizationScript(
stream.println("dummyData$x <- NaN");
stream.println("dummyData$y <- NaN");
stream.println("p <- ggplot(data=" + surfaceFrame + ", aes(x=x, y=y)) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
- stream.println("p1 = p +ggtitle(\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\", space=\"rgb\")");
+ stream.println("p1 = p +ggtitle(\"model PDF\") + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + geom_tile(aes(fill = lod)) + scale_fill_gradient(high=\"green\", low=\"red\", space=\"Lab\")");
stream.println("p <- qplot(x,y,data=" + dataFrame + ", color=retained, alpha=I(1/7),legend=FALSE) +theme(panel.background = element_rect(fill = \"white\"), panel.grid.minor = element_line(colour = \"white\"), panel.grid.major = element_line(colour = \"white\"))");
stream.println("q <- geom_point(aes(x=x,y=y,color=retained),data=dummyData, alpha=1.0, na.rm=TRUE)");
stream.println("p2 = p + q + labs(x=\""+ annotationKeys[iii] +"\", y=\""+ annotationKeys[jjj] +"\") + scale_colour_gradient(name=\"outcome\", high=\"black\", low=\"red\",breaks=c(-1,1),guide=\"legend\",labels=c(\"filtered\",\"retained\"))");
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java
index 24854ccad02..68cc127ea66 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java
@@ -35,11 +35,14 @@
import java.nio.file.Path;
import java.util.*;
import java.util.function.BiFunction;
+import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public final class GATKVariantContextUtils {
+ /** maximum number of sources to include when merging sources */
+ private static final int MAX_SOURCES_TO_INCLUDE = 10;
private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);
public static final String MERGE_FILTER_PREFIX = "filterIn";
@@ -1097,31 +1100,46 @@ public static VariantContext simpleMerge(final Collection unsort
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
- return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled);
+ return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, false, -1);
+ }
+
+    /**
+     * Convenience overload that takes the original number of VCs from the size of the priority list (0 when the
+     * list is null) and delegates to the full simpleMerge.
+     *
+     * @param unsortedVCs             collection of unsorted VCs
+     * @param priorityListOfVCs       priority list detailing the order in which we should grab the VCs; may be null
+     * @param filteredRecordMergeType merge type for filtered records
+     * @param genotypeMergeOptions    merge option for genotypes
+     * @param filteredAreUncalled     are filtered records uncalled?
+     * @param storeAllVcfSources      passed through unchanged to the full simpleMerge
+     * @param maxSourceFieldLength    passed through unchanged to the full simpleMerge
+     * @return new VariantContext representing the merge of unsortedVCs
+     */
+    public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
+                                             final List<String> priorityListOfVCs,
+                                             final FilteredRecordMergeType filteredRecordMergeType,
+                                             final GenotypeMergeType genotypeMergeOptions,
+                                             final boolean filteredAreUncalled,
+                                             final boolean storeAllVcfSources,
+                                             final int maxSourceFieldLength) {
+        final int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
+        return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, storeAllVcfSources, maxSourceFieldLength);
 }
/**
- * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
- * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
- * the sample name.
- * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
- * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
- *
- * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
- *
- * @param unsortedVCs collection of unsorted VCs
- * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
- * @param filteredRecordMergeType merge type for filtered records
- * @param genotypeMergeOptions merge option for genotypes
- * @param filteredAreUncalled are filtered records uncalled?
- * @return new VariantContext representing the merge of unsortedVCs
- */
+ * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
+ * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
+ * the sample name.
+ * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
+ * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
+ *
+ * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
+ *
+ * @param unsortedVCs collection of unsorted VCs
+ * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
+ * @param filteredRecordMergeType merge type for filtered records
+ * @param genotypeMergeOptions merge option for genotypes
+ * @param filteredAreUncalled are filtered records uncalled?
+ * @param storeAllVcfSources if true, the sources of all VCs where isVariant()=true will be concatenated in the output VC's source field. If false, the source of the first VC will be used. This mirrors GATK3's behavior
+ * @param maxSourceFieldLength This can be used to enforce a maximum length for the value of the source field (primarily useful if storeAllVcfSources=true). Set to -1 for unlimited
+ * @return new VariantContext representing the merge of unsortedVCs
+ */
public static VariantContext simpleMerge(final Collection unsortedVCs,
final List priorityListOfVCs,
final int originalNumOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
final GenotypeMergeType genotypeMergeOptions,
- final boolean filteredAreUncalled) {
+ final boolean filteredAreUncalled,
+ final boolean storeAllVcfSources,
+ final int maxSourceFieldLength) {
if ( unsortedVCs == null || unsortedVCs.isEmpty() )
return null;
@@ -1166,7 +1184,7 @@ public static VariantContext simpleMerge(final Collection unsort
longestVC = vc; // get the longest location
nFiltered += vc.isFiltered() ? 1 : 0;
- if ( vc.isVariant() ) variantSources.add(vc.getSource());
+ if ( storeAllVcfSources && vc.isVariant() ) variantSources.add(vc.getSource());
AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);
@@ -1237,7 +1255,19 @@ public static VariantContext simpleMerge(final Collection unsort
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
- final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
+ // This preserves the GATK3-like behavior of reporting multiple sources, delimited with hyphen:
+ // NOTE: if storeAllVcfSources is false, variantSources will be empty and therefore no sorting is performed
+ String allSources = variantSources.isEmpty() ? name : variantSources.stream()
+ .sorted()
+ .distinct()
+ .limit(MAX_SOURCES_TO_INCLUDE)
+ .collect(Collectors.joining("-"));
+
+ if (maxSourceFieldLength != -1 && allSources.length() > maxSourceFieldLength) {
+ allSources = allSources.substring(0, maxSourceFieldLength);
+ }
+
+ final VariantContextBuilder builder = new VariantContextBuilder().source(allSources).id(ID);
builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
builder.alleles(alleles);
builder.genotypes(genotypes);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
index 35864d62f1a..f0c45c8f934 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUnitTest.java
@@ -76,7 +76,7 @@ public Object[][] testCreateInvalidCoordinatesData() {
@Test(dataProvider="testCreateInvalidCoordinatesData", expectedExceptions = { IllegalArgumentException.class })
public void testCreateInvalidCoordinates(final String contigA, final int posA, final String contigB, final int posB) {
new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
- null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.fail("Expected exception not thrown");
}
@@ -93,14 +93,14 @@ public Object[][] testCreateValidCoordinatesData() {
@Test(dataProvider="testCreateValidCoordinatesData")
public void testCreateValidCoordinates(final String contigA, final int posA, final String contigB, final int posB) {
new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
- null, Collections.emptyList(), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ null, Collections.emptyList(), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
}
@Test
public void testGetters() {
final SVCallRecord record = new SVCallRecord("var1", "chr1", 100, true, "chr1", 200, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict)), null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
GenotypesContext.create(GenotypeBuilder.create("sample1", Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL))),
Collections.singletonMap("TEST_KEY", "TEST_VALUE"), Collections.singleton("TEST_FILTER"), Double.valueOf(30), SVTestUtils.hg38Dict);
Assert.assertEquals(record.getId(), "var1");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
index 2c814a45576..1ee6107f3bb 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java
@@ -69,7 +69,7 @@ public Object[][] testGetVariantBuilderData() {
return new Object[][]{
// DEL
{
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2), Collections.emptyMap(), Collections.singleton("TEST_FILTER"), Double.valueOf(-3)),
@@ -86,7 +86,7 @@ public Object[][] testGetVariantBuilderData() {
},
// DEL w/ null ref allele
{
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
Collections.singletonList(Allele.SV_SIMPLE_DEL),
Collections.singletonList(GENOTYPE_DEL_3),
@@ -102,7 +102,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS
{
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -119,7 +119,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS, flipped strands
{
- new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var2", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -136,7 +136,7 @@ public Object[][] testGetVariantBuilderData() {
},
// INS, undefined length
{
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_INS,
Lists.newArrayList(GENOTYPE_INS_1),
@@ -153,7 +153,7 @@ public Object[][] testGetVariantBuilderData() {
},
// BND
{
- new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var_bnd", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_BND,
Lists.newArrayList(GENOTYPE_BND_1),
@@ -172,7 +172,7 @@ public Object[][] testGetVariantBuilderData() {
},
// CTX
{
- new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null,
+ new SVCallRecord("var_ctx", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_CTX,
Lists.newArrayList(GENOTYPE_CTX_1),
@@ -196,6 +196,7 @@ public Object[][] testGetVariantBuilderData() {
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL,
Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:5000-5100", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:100-200", SVTestUtils.hg38Dict)),
100,
+ Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_CPX,
Lists.newArrayList(GENOTYPE_CPX_1),
@@ -223,7 +224,7 @@ public void testGetVariantBuilder(final SVCallRecord record, final VariantContex
@Test
public void testGetVariantBuilderHasSanitizedNullAttributes() {
- final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ final SVCallRecord record = new SVCallRecord("var3", "chr1", 1000, false, "chr2", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
ALLELES_BND,
Lists.newArrayList(GENOTYPE_BND_1),
@@ -300,14 +301,14 @@ public void testFillMissingSamplesWithGenotypes() {
@Test
public void testCopyCallWithNewGenotypes() {
- final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ final SVCallRecord record = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
Collections.singletonMap(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, Collections.singletonList("sample")), Collections.emptySet(), null);
final GenotypesContext genotypes = GenotypesContext.copy(Collections.singletonList(GENOTYPE_DEL_3));
final SVCallRecord result = SVCallRecordUtils.copyCallWithNewGenotypes(record, genotypes);
- final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ final SVCallRecord expected = new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST,
ALLELES_DEL,
genotypes,
@@ -449,7 +450,7 @@ public void testConvertInversionsToBreakends() {
Assert.assertNotNull(nonInversionResult.get(0));
SVTestUtils.assertEqualsExceptMembership(nonInversionResult.get(0), nonInversion);
- final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000,
+ final SVCallRecord inversion = new SVCallRecord("", "chr1", 1000, true, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, Collections.emptyList(), 1000, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(),
Collections.emptyList(),
@@ -524,7 +525,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
null, null,
TEST_ATTRIBUTES, -90.),
- new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var1", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
TEST_ATTRIBUTES, Collections.emptySet(), -90.)
},
@@ -534,7 +535,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
null, null,
TEST_ATTRIBUTES, null),
- new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000,
+ new SVCallRecord("var2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), ALLELES_DEL, Lists.newArrayList(GENOTYPE_DEL_1, GENOTYPE_DEL_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -543,7 +544,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "+-",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var3", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -552,7 +553,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), 500, "-+",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500,
+ new SVCallRecord("var4", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), 500, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -561,7 +562,7 @@ public Object[][] testCreateData() {
ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2), -1, "-+",
GATKSVVCFConstants.StructuralVariantAnnotationType.INS, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES, null),
- new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null,
+ new SVCallRecord("var4b", "chr1", 1000, false, "chr1", 1000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_INS, Lists.newArrayList(GENOTYPE_INS_1, GENOTYPE_INS_2),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -570,7 +571,7 @@ public Object[][] testCreateData() {
ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES, null),
- new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var5", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -579,7 +580,7 @@ public Object[][] testCreateData() {
ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.BND, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES, null),
- new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ new SVCallRecord("var6", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_BND, Collections.singletonList(GENOTYPE_BND_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -588,7 +589,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var7", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -597,7 +598,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chr1", null, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var8", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -606,7 +607,7 @@ public Object[][] testCreateData() {
ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1), 250, null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
null, null, TEST_ATTRIBUTES_CPX, null),
- new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250,
+ new SVCallRecord("var9", "chr1", 1000, null, "chr1", 2000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, GATKSVVCFConstants.ComplexVariantSubtype.dDUP, Collections.emptyList(), 250, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -622,6 +623,7 @@ public Object[][] testCreateData() {
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL,
Lists.newArrayList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:100-200", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("DEL_chr2:300-400", SVTestUtils.hg38Dict)),
250,
+ Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CPX, Collections.singletonList(GENOTYPE_CPX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -630,7 +632,7 @@ public Object[][] testCreateData() {
ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1), null, "++",
GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
"chrX", 2000, TEST_ATTRIBUTES_CTX, null),
- new SVCallRecord("var11", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null,
+ new SVCallRecord("var11", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
@@ -642,7 +644,7 @@ public Object[][] testCreateData() {
"chrX", 2000,
Map.of("TEST_KEY", "TEST_VAL", GATKSVVCFConstants.CPX_TYPE, "CTX_PP/QQ", GATKSVVCFConstants.CPX_INTERVALS, Collections.emptyList()),
null),
- new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null,
+ new SVCallRecord("var12", "chr1", 1000, true, "chrX", 2000, true, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, Collections.emptyList(), null, Collections.emptyList(),
SVTestUtils.PESR_ONLY_ALGORITHM_LIST, ALLELES_CTX, Collections.singletonList(GENOTYPE_CTX_1),
TEST_ATTRIBUTES, Collections.emptySet(), null)
},
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
index 5a8e97a05fd..110516ac935 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java
@@ -10,7 +10,10 @@
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SimpleSVType;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
-import org.broadinstitute.hellbender.tools.sv.cluster.*;
+import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser;
+import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage;
+import org.broadinstitute.hellbender.tools.sv.cluster.ClusteringParameters;
+import org.broadinstitute.hellbender.tools.sv.cluster.SVClusterEngine;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
@@ -33,7 +36,8 @@ public class SVTestUtils {
new CanonicalSVCollapser(
hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
public static final String PESR_ALGORITHM = "pesr";
public static final List DEPTH_ONLY_ALGORITHM_LIST = Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM);
@@ -121,6 +125,21 @@ public static SVClusterEngine getNewDefaultMaxCliqueEngine() {
.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
public final static SVCallRecord makeRecord(final String id,
+ final String contigA,
+ final int positionA,
+ final Boolean strandA,
+ final String contigB,
+ final int positionB,
+ final Boolean strandB,
+ final GATKSVVCFConstants.StructuralVariantAnnotationType type,
+ final Integer length,
+ final List algorithms,
+ final List alleles,
+ final List genotypeBuilders) { // convenience overload: same record as makeRecordWithEvidenceAndQuality, minus the evidence and quality inputs
+ return makeRecordWithEvidenceAndQuality(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, Collections.emptyList(), algorithms, alleles, genotypeBuilders, null); // emptyList() is passed as the evidence list, null as log10PError
+ }
+
+ public final static SVCallRecord makeRecordWithEvidenceAndQuality(final String id,
final String contigA,
final int positionA,
final Boolean strandA,
@@ -129,17 +148,19 @@ public final static SVCallRecord makeRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final Integer length,
+ final List evidence,
final List algorithms,
final List alleles,
- final List genotypeBuilders) {
+ final List genotypeBuilders,
+ final Double log10PError) {
final Allele refAllele = Allele.create(ReferenceUtils.getRefBaseAtPosition(hg38Reference, contigA, positionA), true);
final List newAlleles = replaceRefAlleles(alleles, refAllele);
final List genotypes = new ArrayList<>(genotypeBuilders.size());
for (final GenotypeBuilder builder : genotypeBuilders) {
genotypes.add(makeGenotypeWithRefAllele(builder, refAllele));
}
- return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, algorithms,
- newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), null, hg38Dict);
+ return new SVCallRecord(id, contigA, positionA, strandA, contigB, positionB, strandB, type, null, Collections.emptyList(), length, evidence, algorithms,
+ newAlleles, genotypes, Collections.emptyMap(), Collections.emptySet(), log10PError, hg38Dict);
}
public static final Genotype makeGenotypeWithRefAllele(final GenotypeBuilder builder, final Allele refAllele) {
@@ -378,16 +399,16 @@ public static SVCallRecord newCallRecordWithAllelesAndSampleName(final String sa
builder = builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
}
return new SVCallRecord("", "chr1", 100, getValidTestStrandA(svtype), "chr1", 199, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ svtype, null, Collections.emptyList(), 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
variantAlleles,
Collections.singletonList(builder.make()),
Collections.emptyMap(), Collections.emptySet(), null);
}
- public static SVCallRecord newNamedDeletionRecordWithAttributes(final String id, final Map attributes) {
- return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false,
+ public static SVCallRecord newDeletionRecordWithAttributes(final Map attributes) {
+ return new SVCallRecord("", "chr1", 100, true, "chr1", 199, false,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
attributes, Collections.emptySet(), null);
@@ -398,7 +419,7 @@ public static SVCallRecord newNamedDeletionRecordWithAttributesAndGenotypes(fina
final Map attributes) {
return new SVCallRecord(id, "chr1", 100, true, "chr1", 199, false,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 100, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
genotypes,
attributes, Collections.emptySet(), null);
@@ -416,40 +437,40 @@ public static final Map keyValueArraysToMap(final String[] keys,
public static SVCallRecord newCallRecordWithLengthAndType(final Integer length, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
final int positionB = length == null ? 1 : CoordMath.getEnd(1, length);
return new SVCallRecord("", "chr1", 1, getValidTestStrandA(svtype), "chr1", positionB, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
+ svtype, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), Collections.emptySet(), null);
}
public static SVCallRecord newDeletionCallRecordWithIdAndAlgorithms(final String id, final List algorithms) {
return new SVCallRecord(id, "chr1", 1, true, "chr1", 100, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, algorithms, Collections.emptyList(),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), 100, Collections.emptyList(), algorithms, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newPESRCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newInsertionWithPositionAndLength(final int start, final int length) {
return new SVCallRecord("", "chr1", start, true, "chr1", start + 1, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
public static SVCallRecord newDepthCallRecordWithIntervalAndType(final int start, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", "chr1", start, getValidTestStrandA(svtype), "chr1", end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), DEPTH_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
// Note strands and length may not be set properly
public static SVCallRecord newCallRecordWithContigsIntervalAndType(final String startContig, final int start, final String endContig, final int end, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype) {
return new SVCallRecord("", startContig, start, getValidTestStrandA(svtype), endContig, end, getValidTestStrandB(svtype),
- svtype, null, Collections.emptyList(), getLength(start, end, svtype), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
+ svtype, null, Collections.emptyList(), getLength(start, end, svtype), Collections.emptyList(), PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null);
}
@@ -463,7 +484,7 @@ public static Integer getLength(final int start, final int end, final GATKSVVCFC
}
public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, final boolean strandB) {
- return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1000, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -471,7 +492,7 @@ public static SVCallRecord newBndCallRecordWithStrands(final boolean strandA, fi
}
public static SVCallRecord newCtxCallRecord() {
- return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null,
+ return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -479,7 +500,7 @@ public static SVCallRecord newCtxCallRecord() {
}
public static SVCallRecord newCpxCallRecordWithLength(final int length) {
- return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, null, "chr1", 1000, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, null, Collections.emptyList(), length, Collections.emptyList(),
Collections.singletonList(PESR_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -487,7 +508,7 @@ public static SVCallRecord newCpxCallRecordWithLength(final int length) {
}
public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, final Boolean strandB) {
- return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000,
+ return new SVCallRecord("", "chr1", 1000, strandA, "chr1", 1999, strandB, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(), 1000, Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -495,7 +516,7 @@ public static SVCallRecord newCnvCallRecordWithStrands(final Boolean strandA, fi
}
public static SVCallRecord newCallRecordWithCoordinates(final String id, final String chrA, final int posA, final String chrB, final int posB) {
- return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null,
+ return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(), null, Collections.emptyList(),
Collections.singletonList("peser"),
Collections.emptyList(),
Collections.emptyList(),
@@ -503,7 +524,7 @@ public static SVCallRecord newCallRecordWithCoordinates(final String id, final S
}
public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id, final String chrA, final int posA, final String chrB, final int posB, final GATKSVVCFConstants.StructuralVariantAnnotationType type) {
- return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type),
+ return new SVCallRecord(id, chrA, posA, true, chrB, posB, false, type, null, Collections.emptyList(), getLength(posA, posB, type), Collections.emptyList(),
Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(),
Collections.emptyList(),
@@ -511,7 +532,7 @@ public static SVCallRecord newCallRecordWithCoordinatesAndType(final String id,
}
public static SVCallRecord newCallRecordWithAlgorithms(final List algorithms) {
- return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
algorithms,
Collections.emptyList(),
Collections.emptyList(),
@@ -519,7 +540,15 @@ public static SVCallRecord newCallRecordWithAlgorithms(final List algori
}
public static SVCallRecord newCallRecordInsertionWithLength(final Integer length) {
- return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length,
+ return new SVCallRecord("", "chr1", 1000, true, "chr1", 1000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
+ PESR_ONLY_ALGORITHM_LIST,
+ Collections.emptyList(),
+ Collections.emptyList(),
+ Collections.emptyMap(), Collections.emptySet(), null);
+ }
+
+ public static SVCallRecord newCallRecordInsertionWithLengthAndCoordinates(final String chrA, final int posA, final Integer length) {
+ return new SVCallRecord("", chrA, posA, true, chrA, posA, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(), length, Collections.emptyList(),
PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(),
Collections.emptyList(),
@@ -597,4 +626,18 @@ public static GenotypeBuilder getDiploidCNVGenotypeBuilder(final String sample,
.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber);
}
+
+ public static Map buildMapFromArrays(final String[] keys, final Object[] values) {
+ if (keys.length != values.length) {
+ throw new TestException("Keys and values have different lengths: " + keys.length + " and " + values.length);
+ }
+ final Map map = new HashMap<>();
+ for (int i = 0; i < keys.length; i++) {
+ if (keys[i] == null) {
+ throw new TestException("Encountered null key");
+ }
+ map.put(keys[i], values[i]);
+ }
+ return map;
+ }
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
index 611763ff23c..384df0f0ef2 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java
@@ -9,6 +9,7 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -95,20 +96,22 @@ public void testGetMaxClusterableStartingPosition() {
public void testAdd() {
//single-sample merge case, ignoring sample sets
final SVClusterEngine temp1 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals);
- temp1.add(SVTestUtils.call1);
+ final List output1 = new ArrayList<>();
+ output1.addAll(temp1.addAndFlush(SVTestUtils.call1));
//force new cluster by adding a non-overlapping event
- temp1.add(SVTestUtils.call3);
- final List output1 = temp1.forceFlush(); //flushes all clusters
+ output1.addAll(temp1.addAndFlush(SVTestUtils.call3));
+ output1.addAll(temp1.flush()); //flushes all clusters
Assert.assertEquals(output1.size(), 2);
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0));
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1));
final SVClusterEngine temp2 = SVClusterEngineFactory.createBinnedCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, paddingFraction, 0.8, SVTestUtils.targetIntervals);
- temp2.add(SVTestUtils.call1);
- temp2.add(SVTestUtils.call2); //should overlap after padding
+ final List output2 = new ArrayList<>();
+ output2.addAll(temp2.addAndFlush(SVTestUtils.call1));
+ output2.addAll(temp2.addAndFlush(SVTestUtils.call2)); //should overlap after padding
//force new cluster by adding a call on another contig
- temp2.add(SVTestUtils.call4_chr10);
- final List output2 = temp2.forceFlush();
+ output2.addAll(temp2.addAndFlush(SVTestUtils.call4_chr10));
+ output2.addAll(temp2.flush());
Assert.assertEquals(output2.size(), 2);
Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
Assert.assertEquals(output2.get(0).getPositionB(), SVTestUtils.call2.getPositionB());
@@ -116,9 +119,10 @@ public void testAdd() {
//cohort case, checking sample set overlap
final SVClusterEngine temp3 = SVClusterEngineFactory.createCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, CNVLinkage.DEFAULT_PADDING_FRACTION, CNVLinkage.DEFAULT_SAMPLE_OVERLAP);
- temp3.add(SVTestUtils.call1);
- temp3.add(SVTestUtils.sameBoundsSampleMismatch);
- final List output3 = temp3.forceFlush();
+ final List output3 = new ArrayList<>();
+ output3.addAll(temp3.addAndFlush(SVTestUtils.call1));
+ output3.addAll(temp3.addAndFlush(SVTestUtils.sameBoundsSampleMismatch));
+ output3.addAll(temp3.flush());
Assert.assertEquals(output3.size(), 2);
}
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
index 012c302bd83..cb540d4988e 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVDefragmenterTest.java
@@ -21,47 +21,47 @@ public class CNVDefragmenterTest {
@Test
public void testClusterTogether() {
final SVCallRecord deletion = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
final SVCallRecord duplication = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, duplication), "Different sv types should not cluster");
final SVCallRecord duplicationNonDepthOnly = new SVCallRecord("test_dup", "chr1", 1000, false, "chr1", 1999, true, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, null, Collections.emptyList(),
- 1000, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM),
+ 1000, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(duplication, duplicationNonDepthOnly), "Clustered records must be depth-only");
final SVCallRecord cnv = new SVCallRecord("test_cnv", "chr1", 1000, null, "chr1", 1999, null, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, cnv), "Different sv types should not cluster");
final SVCallRecord insertion = new SVCallRecord("test_ins", "chr1", 1000, true, "chr1", 1001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, null, Collections.emptyList(),
- 1000, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ 1000, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(insertion, insertion), "Only CNVs should be valid");
final SVCallRecord deletion2 = new SVCallRecord("test_del2", "chr1", 1000, true, "chr1", 1999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(deletion, deletion2), "Valid identical records should cluster");
final SVCallRecord deletion3 = new SVCallRecord("test_del3", "chr1", 2999, true, "chr1", 3998, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(deletion, deletion3), "Should cluster due to overlap");
final SVCallRecord deletion4 = new SVCallRecord("test_del3", "chr1", 3000, true, "chr1", 3999, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(deletion, deletion4), "Should barely not cluster");
@@ -190,7 +190,7 @@ public Object[][] recordPairs() {
@Test(dataProvider= "maxPositionIntervals")
public void testGetMaxClusterableStartingPosition(final int start, final int end) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- end - start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ end - start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
final int maxClusterableStart = defragmenter.getMaxClusterableStartingPosition(call1);
@@ -198,7 +198,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
final int call2Start = maxClusterableStart;
final int call2End = dictionary.getSequence(call1.getContigA()).getSequenceLength();
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- call2End - call2Start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call2End - call2Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertTrue(defragmenter.areClusterable(call1, call2));
@@ -206,7 +206,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
final int call3Start = maxClusterableStart + 1;
final int call3End = dictionary.getSequence(call1.getContigA()).getSequenceLength();
final SVCallRecord call3 = new SVCallRecord("call3", "chr1", call3Start, true, "chr1", call3End, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- call3End - call3Start + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call3End - call3Start + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, dictionary);
Assert.assertFalse(defragmenter.areClusterable(call1, call3));
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
index f46e4a6e7c9..0b83beadb0b 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java
@@ -26,27 +26,43 @@ public class CanonicalSVCollapserUnitTest {
private static final CanonicalSVCollapser collapser = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMinMax = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMaxMin = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserMean = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserRepresentative = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.REPRESENTATIVE,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
private static final CanonicalSVCollapser collapserSpecificAltAllele = new CanonicalSVCollapser(
SVTestUtils.hg38Reference,
CanonicalSVCollapser.AltAlleleSummaryStrategy.MOST_SPECIFIC_SUBTYPE,
- CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END);
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.OR);
+ private static final CanonicalSVCollapser collapserFlagAnd = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.AND);
+ private static final CanonicalSVCollapser collapserFlagAlwaysFalse = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE);
private static final Allele MEI_INSERTION_ALLELE = Allele.create("");
private static final Allele SVA_INSERTION_ALLELE = Allele.create("");
@@ -630,23 +646,23 @@ public Object[][] collapseSampleGenotypesTestData() {
},
// het preferred over hom ref even with lower gq
{
+ "sample",
+ Lists.newArrayList(
+ Lists.newArrayList(Allele.REF_N, Allele.REF_N),
+ Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithGQ(2, 30),
+ createGenotypeTestAttributesWithGQ(2, 20)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
"sample",
- Lists.newArrayList(
- Lists.newArrayList(Allele.REF_N, Allele.REF_N),
- Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS)
- ),
- Lists.newArrayList(
- createGenotypeTestAttributesWithGQ(2, 30),
- createGenotypeTestAttributesWithGQ(2, 20)
- ),
- Allele.REF_N,
- GenotypeBuilder.create(
- "sample",
- Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
- createGenotypeTestAttributesWithGQ(2, 20)
- )
- },
- // hom var preferred over het
+ Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
+ createGenotypeTestAttributesWithGQ(2, 20)
+ )
+ },
+ // het preferred over hom-var
{
"sample",
Lists.newArrayList(
@@ -660,11 +676,11 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributes(2)
)
},
- // hom-var over het if GQ equal
+ // het over hom-var if GQ equal
{
"sample",
Lists.newArrayList(
@@ -678,11 +694,11 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributes(2)
)
},
- // het over hom-var if GQ is higher
+ // hom-var over het if GQ is higher
{
"sample",
Lists.newArrayList(
@@ -690,17 +706,17 @@ public Object[][] collapseSampleGenotypesTestData() {
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS)
),
Lists.newArrayList(
- createGenotypeTestAttributesWithGQ(2, 30),
- createGenotypeTestAttributesWithGQ(2, 40)
+ createGenotypeTestAttributesWithGQ(2, 40),
+ createGenotypeTestAttributesWithGQ(2, 30)
),
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributesWithGQ(2, 40)
)
},
- // hom var preferred over hom-ref too
+ // het preferred over hom-ref too
{
"sample",
Lists.newArrayList(
@@ -716,7 +732,7 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributesWithGQ(2, 30)
)
},
@@ -775,7 +791,7 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributes(3)
)
},
@@ -797,7 +813,7 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributes(3)
)
},
@@ -825,7 +841,7 @@ public Object[][] collapseSampleGenotypesTestData() {
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS),
+ Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS),
createGenotypeTestAttributes(3)
)
},
@@ -1067,7 +1083,7 @@ public Object[][] collapseSampleGenotypesTestData() {
createGenotypeTestAttributes(1, 1)
)
},
- // Multi-allelic CNV, rare case with equal CNQ take CN!=ECN
+ // Multi-allelic CNV, with no CNQ use copy state closest to ref
{
"sample",
Lists.newArrayList(
@@ -1075,35 +1091,34 @@ public Object[][] collapseSampleGenotypesTestData() {
Collections.singletonList(Allele.NO_CALL)
),
Lists.newArrayList(
- createGenotypeTestAttributesWithCNQ(1, 1, 30),
- createGenotypeTestAttributesWithCNQ(1, 0, 30)
+ createGenotypeTestAttributes(1, 1),
+ createGenotypeTestAttributes(1, 2)
),
Allele.REF_N,
GenotypeBuilder.create(
"sample",
Lists.newArrayList(Allele.NO_CALL),
- createGenotypeTestAttributesWithCNQ(1, 0, 30)
+ createGenotypeTestAttributes(1, 1)
)
},
- // Multi-allelic CNV, haploid dup
+ // Multi-allelic CNV, when CNQ equal use copy state closest to ref
{
"sample",
Lists.newArrayList(
- Collections.singletonList(Allele.NO_CALL),
- Collections.singletonList(Allele.NO_CALL)
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL)
),
Lists.newArrayList(
- createGenotypeTestAttributes(1, 1),
- createGenotypeTestAttributes(1, 2)
+ createGenotypeTestAttributesWithCNQ(1, 1, 30),
+ createGenotypeTestAttributesWithCNQ(1, 2, 30)
),
Allele.REF_N,
GenotypeBuilder.create(
"sample",
- Lists.newArrayList(Allele.NO_CALL),
- createGenotypeTestAttributes(1, 2)
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(1, 1, 30)
)
},
- // Multi-allelic CNV, when CNQ equal use CN!=ECN
{
"sample",
Lists.newArrayList(
@@ -1112,16 +1127,32 @@ public Object[][] collapseSampleGenotypesTestData() {
),
Lists.newArrayList(
createGenotypeTestAttributesWithCNQ(2, 2, 30),
- createGenotypeTestAttributesWithCNQ(2, 1, 30)
+ createGenotypeTestAttributesWithCNQ(2, 3, 30)
),
Allele.REF_N,
GenotypeBuilder.create(
"sample",
Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 2, 30)
+ )
+ },
+ {
+ "sample",
+ Lists.newArrayList(
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(2, 2, 30),
createGenotypeTestAttributesWithCNQ(2, 1, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 2, 30)
)
},
- // Multi-allelic CNV, when CNQ equal use CN!=ECN
{
"sample",
Lists.newArrayList(
@@ -1136,7 +1167,41 @@ public Object[][] collapseSampleGenotypesTestData() {
GenotypeBuilder.create(
"sample",
Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 2, 30)
+ )
+ },
+ {
+ "sample",
+ Lists.newArrayList(
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(2, 1, 30),
createGenotypeTestAttributesWithCNQ(2, 0, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 1, 30)
+ )
+ },
+ {
+ "sample",
+ Lists.newArrayList(
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(2, 5, 30),
+ createGenotypeTestAttributesWithCNQ(2, 6, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 5, 30)
)
},
// Multi-allelic CNV, conflicting del and dup genotypes determined by CNQ
@@ -1174,6 +1239,58 @@ public Object[][] collapseSampleGenotypesTestData() {
createGenotypeTestAttributesWithCNQ(1, 2, 50)
)
},
+ // With equal CNQ, DEL is preferred over DUP as the tiebreaker when both are the same distance from the expected copy number
+ {
+ "sample",
+ Lists.newArrayList(
+ Collections.singletonList(Allele.NO_CALL),
+ Collections.singletonList(Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(1, 2, 30),
+ createGenotypeTestAttributesWithCNQ(1, 0, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(1, 0, 30)
+ )
+ },
+ {
+ "sample",
+ Lists.newArrayList(
+ Collections.singletonList(Allele.NO_CALL),
+ Collections.singletonList(Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(2, 0, 30),
+ createGenotypeTestAttributesWithCNQ(2, 4, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 0, 30)
+ )
+ },
+ {
+ "sample",
+ Lists.newArrayList(
+ Collections.singletonList(Allele.NO_CALL),
+ Collections.singletonList(Allele.NO_CALL)
+ ),
+ Lists.newArrayList(
+ createGenotypeTestAttributesWithCNQ(2, 1, 30),
+ createGenotypeTestAttributesWithCNQ(2, 3, 30)
+ ),
+ Allele.REF_N,
+ GenotypeBuilder.create(
+ "sample",
+ Lists.newArrayList(Allele.NO_CALL),
+ createGenotypeTestAttributesWithCNQ(2, 1, 30)
+ )
+ },
};
}
@@ -1234,15 +1351,160 @@ public void collapseLengthTest(final SVCallRecord record,
Assert.assertEquals(collapser.collapseLength(record, type, record.getPositionA(), record.getPositionB()), expectedLength);
}
- @DataProvider(name = "collapseIdsTestData")
- public Object[][] collapseIdsTestData() {
+
+ @DataProvider(name = "collapseAttributesTestData")
+ public Object[][] collapseAttributesTestData() {
return new Object[][]{
- {Collections.singletonList("var1"), "var1"},
- {Lists.newArrayList("var1", "var2"), "var1"},
- {Lists.newArrayList("var2", "var1"), "var1"},
+ // Empty case
+ {
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Use representative
+ {
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ new String[]{}, new Object[]{},
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{"TEST_KEY"}, new Object[]{"TEST_VALUE"},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE2"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ new String[]{"TEST_KEY1", "TEST_KEY2"}, new Object[]{"TEST_VALUE12", "TEST_VALUE22"},
+ new String[]{"TEST_KEY1"}, new Object[]{"TEST_VALUE1"},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Reserved flags OR
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{}, // False results in non-assignment, implying false
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.OR
+ },
+ // Reserved flags AND
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.FALSE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ {
+ new String[]{}, new Object[]{},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE, GATKSVVCFConstants.HIGH_SR_BACKGROUND_ATTRIBUTE}, new Object[]{Boolean.TRUE, Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.AND
+ },
+ // Reserved flags ALWAYS_FALSE
+ {
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{GATKSVVCFConstants.BOTHSIDES_SUPPORT_ATTRIBUTE}, new Object[]{Boolean.TRUE},
+ new String[]{}, new Object[]{},
+ CanonicalSVCollapser.FlagFieldLogic.ALWAYS_FALSE
+ },
};
}
+ @Test(dataProvider= "collapseAttributesTestData")
+ public void collapseAttributesTest(final String[] representativeKeys, final Object[] representativeValues,
+ final String[] secondKeys, final Object[] secondValues,
+ final String[] expectedKeys, final Object[] expectedValues,
+ final CanonicalSVCollapser.FlagFieldLogic flagLogic) {
+ final Map representativeMap = SVTestUtils.buildMapFromArrays(representativeKeys, representativeValues);
+ final Map secondMap = SVTestUtils.buildMapFromArrays(secondKeys, secondValues);
+ final Map expectedMap = SVTestUtils.buildMapFromArrays(expectedKeys, expectedValues);
+ final SVCallRecord representativeCall = SVTestUtils.newDeletionRecordWithAttributes(representativeMap);
+ final SVCallRecord secondCall = SVTestUtils.newDeletionRecordWithAttributes(secondMap);
+ final Collection collection = Lists.newArrayList(secondCall, representativeCall);
+ final CanonicalSVCollapser testCollapser = new CanonicalSVCollapser(
+ SVTestUtils.hg38Reference,
+ CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE,
+ CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END,
+ flagLogic);
+ final Map result = new HashMap<>(testCollapser.collapseAttributes(representativeCall, collection));
+ // Ignore MEMBERS field
+ result.remove(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
+ Assert.assertEquals(result, expectedMap);
+ }
+
@DataProvider(name = "getMostPreciseCallsTestData")
public Object[][] getMostPreciseCallsTestData() {
return new Object[][]{
@@ -1428,32 +1690,205 @@ public void collapseIntervalTest(final String[] contigs, final int[] starts, fin
collapseIntervalTestHelper(collapserMean, svtype, contigs, records, expectedMean);
}
- @Test
- public void collapseIntervalRepresentativeTest() {
+ @DataProvider(name = "collapseIntervalRepresentativeTestData")
+ public Object[][] collapseIntervalRepresentativeTestData() {
+ return new Object[][]{
+ // equal evidence, expect second with more carriers
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ 0.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ {
+ -99.,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ false
+ },
+ // quality based: the record with the lower log10PError wins, even over stronger evidence type
+ {
+ -99.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ null,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ -10.,
+ -9.,
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ -10.,
+ -9.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ true
+ },
+ // with tied quality, SR outranks PE and every other evidence type (or no evidence)
+ {
+ -99.,
+ -99.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ // note: a null quality ties with a quality of 0, so evidence type decides
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ true
+ },
+ {
+ 0.,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ true
+ },
+ {
+ null,
+ 0.,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE, GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.SR},
+ false
+ },
+ // PE > others
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ true
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.PE},
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD, GATKSVVCFConstants.EvidenceTypes.BAF},
+ true
+ },
+ // irrelevant evidence, expect second with more carriers
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.RD},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ {
+ null,
+ null,
+ new GATKSVVCFConstants.EvidenceTypes[]{GATKSVVCFConstants.EvidenceTypes.BAF},
+ new GATKSVVCFConstants.EvidenceTypes[]{},
+ false
+ },
+ };
+ }
+
+ @Test(dataProvider = "collapseIntervalRepresentativeTestData")
+ public void collapseIntervalRepresentativeTest(final Double log10PErrorA,
+ final Double log10PErrorB,
+ final GATKSVVCFConstants.EvidenceTypes[] evidenceA,
+ final GATKSVVCFConstants.EvidenceTypes[] evidenceB,
+ final boolean expectFirst) {
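- // Choose second record with more carriers
+ // record2 has more carriers (sample2 is a carrier only there); the evidence/quality inputs decide whether record1 is chosen instead (expectFirst)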
final List records =
Lists.newArrayList(
- SVTestUtils.makeRecord("record1", "chr1", 1000, true,
+ SVTestUtils.makeRecordWithEvidenceAndQuality("record1", "chr1", 1000, true,
"chr1", 2000, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ null, Arrays.asList(evidenceA), Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Lists.newArrayList(
new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2),
new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.REF_N)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
- )
+ ),
+ log10PErrorA
),
- SVTestUtils.makeRecord("record2", "chr1", 1001, true,
+ SVTestUtils.makeRecordWithEvidenceAndQuality("record2", "chr1", 1001, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
- null, Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
+ null, Arrays.asList(evidenceB), Collections.emptyList(), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Lists.newArrayList(
new GenotypeBuilder("sample1", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2),
new GenotypeBuilder("sample2", Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)).attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2)
- )
+ ),
+ log10PErrorB
)
);
final Pair result = collapserRepresentative.collapseInterval(records);
- Assert.assertEquals((int) result.getLeft(), 1001);
- Assert.assertEquals((int) result.getRight(), 2001);
+ if (expectFirst) {
+ Assert.assertEquals((int) result.getLeft(), 1000);
+ Assert.assertEquals((int) result.getRight(), 2000);
+ } else {
+ Assert.assertEquals((int) result.getLeft(), 1001);
+ Assert.assertEquals((int) result.getRight(), 2001);
+ }
+ }
+ @Test
+ public void collapseIntervalRepresentativeByCoordinatesTest() {
// record2 and record3 have the best carrier status, but choose second record which is closer to all others on average
final List records2 =
Lists.newArrayList(
@@ -1483,9 +1918,9 @@ public void collapseIntervalRepresentativeTest() {
)
);
final Pair result2 = collapserRepresentative.collapseInterval(records2);
- Assert.assertEquals((int) result2.getLeft(), 999);
- Assert.assertEquals((int) result2.getRight(), 2000);
- }
+ Assert.assertEquals((int) result2.getLeft(), 999);
+ Assert.assertEquals((int) result2.getRight(), 2000);
+}
@DataProvider(name = "distanceDataProvider")
public Object[][] distanceDataProvider() {
@@ -1583,7 +2018,7 @@ public void testComplexSubtypeAndIntervals() {
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord cpx2 = new SVCallRecord("cpx1", "chr1", 1000, null,
@@ -1591,7 +2026,7 @@ public void testComplexSubtypeAndIntervals() {
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DUP_chr1:6000-8000", SVTestUtils.hg38Dict)),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord result = collapser.collapse(new SVClusterEngine.OutputCluster(Lists.newArrayList(cpx1, cpx2)));
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
index 3b51938e997..a3f768c26bb 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.tools.sv.cluster;
import com.google.common.collect.Lists;
-import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.Genotype;
+import htsjdk.variant.variantcontext.GenotypeBuilder;
+import htsjdk.variant.variantcontext.GenotypesContext;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
@@ -50,7 +53,7 @@ private static Integer inferLength(final String contigA, final int posA, final S
@BeforeTest
public void initializeClusterEngine() {
- engine.add(SVTestUtils.call1);
+ engine.addAndFlush(SVTestUtils.call1);
linkageSizeSimilarity.setDepthOnlyParams(depthOnlyParametersSizeSimilarity);
linkageSizeSimilarity.setMixedParams(mixedParametersSizeSimilarity);
linkageSizeSimilarity.setEvidenceParams(evidenceParametersSizeSimilarity);
@@ -167,12 +170,12 @@ public void testClusterTogetherInvalidInterval() {
// End position beyond contig end after padding
final SVCallRecord deletion1 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956423 + SVTestUtils.defaultEvidenceParameters.getWindow(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord deletion2 = new SVCallRecord("test_del", "chr1", 1000, true, "chr1", 248956422, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- null, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ null, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
engine.getLinkage().areClusterable(deletion1, deletion2);
@@ -204,7 +207,7 @@ public void testGetMaxClusterableStartingPosition(final int start, final int end
private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start, final int end, final String algorithm) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start, true, "chr1", end, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- end - start + 1, Collections.singletonList(algorithm),
+ end - start + 1, Collections.emptyList(), Collections.singletonList(algorithm),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final int maxClusterableStart = engine.getLinkage().getMaxClusterableStartingPosition(call1);
@@ -212,12 +215,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start,
final int call2Start = maxClusterableStart;
final SVCallRecord call2Depth = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2Pesr = new SVCallRecord("call2", "chr1", call2Start, true, "chr1", call2Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2Depth) || engine.getLinkage().areClusterable(call1, call2Pesr));
@@ -225,12 +228,12 @@ private void testGetMaxClusterableStartingPositionWithAlgorithm(final int start,
final int call3Start = maxClusterableStart + 1;
final SVCallRecord call3Depth = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ call1.getLength(), Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call3Pesr = new SVCallRecord("call2", "chr1", call3Start, true, "chr1", call3Start + call1.getLength() - 1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
null, Collections.emptyList(),
- call1.getLength(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ call1.getLength(), Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertFalse(engine.getLinkage().areClusterable(call1, call3Depth) || engine.getLinkage().areClusterable(call1, call3Pesr));
@@ -286,12 +289,12 @@ public Object[][] clusterTogetherVaryPositionsProvider() {
public void testClusterTogetherVaryPositions(final int start1, final int end1, final int start2, final int end2, final boolean result) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", start1, true,
"chr1", end1, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end1 - start1 + 1, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", start2, true,
"chr1", end2, false,
- GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(), end2 - start2 + 1, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP),
SVTestUtils.threeGenotypes, Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), result);
@@ -303,12 +306,12 @@ public void testClusterTogetherVaryTypes() {
// Pass in null strands to let them be determined automatically
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, SVTestUtils.getValidTestStrandA(type1),
"chr1", 2001, SVTestUtils.getValidTestStrandB(type1), type1, null, Collections.emptyList(),
- SVTestUtils.getLength(1000, 2001, type1), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ SVTestUtils.getLength(1000, 2001, type1), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final GATKSVVCFConstants.StructuralVariantAnnotationType type2 : GATKSVVCFConstants.StructuralVariantAnnotationType.values()) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, SVTestUtils.getValidTestStrandA(type2),
"chr1", 2001, SVTestUtils.getValidTestStrandB(type2), type2, null, Collections.emptyList(),
- SVTestUtils.getLength(1000, 2001, type2), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ SVTestUtils.getLength(1000, 2001, type2), Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster together if same type, except CNVs
if ((type1 == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV && call2.isSimpleCNV()) ||
@@ -328,13 +331,13 @@ public void testClusterTogetherVaryStrands() {
for (final Boolean strand1B : bools) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, strand1A,
"chr1", 2001, strand1B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final Boolean strand2A : bools) {
for (final Boolean strand2B : bools) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, strand2A,
"chr1", 2001, strand2B, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ null, Collections.emptyList(), Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster if strands match
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), strand1A == strand2A && strand1B == strand2B);
@@ -353,7 +356,7 @@ public void testClusterTogetherVaryContigs() {
final String contig1B = contigs.get(j);
final SVCallRecord call1 = new SVCallRecord("call1", contig1A, 1000, true,
contig1B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (int k = 0; k < contigs.size(); k++) {
final String contig2A = contigs.get(k);
@@ -361,7 +364,7 @@ public void testClusterTogetherVaryContigs() {
final String contig2B = contigs.get(m);
final SVCallRecord call2 = new SVCallRecord("call2", contig2A, 1000, true,
contig2B, 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, Collections.emptyList(),
- null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
+ null, Collections.emptyList(), SVTestUtils.PESR_ONLY_ALGORITHM_LIST,
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Should only cluster if contigs match
Assert.assertEquals(engine.getLinkage().areClusterable(call1, call2), contig1A.equals(contig2A) && contig1B.equals(contig2B));
@@ -381,11 +384,11 @@ public void testClusterTogetherVaryAlgorithms() {
for (final List algorithms1 : algorithmsList) {
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), algorithms1, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
for (final List algorithms2 : algorithmsList) {
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), algorithms2, Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// All combinations should cluster
Assert.assertTrue(engine.getLinkage().areClusterable(call1, call2));
}
@@ -424,6 +427,50 @@ public void testClusterTogetherCNVs() {
Assert.assertFalse(engine.getLinkage().areClusterable(del1, dup1));
}
+ @DataProvider(name = "testMatchCNVNoGTData")
+ public Object[][] testMatchCNVNoGTData() {
+ return new Object[][]{
+ // Each row: {ploidy, copyNumbers1, copyNumbers2, expected}. First case: no samples in either record
+ {0, new int[]{}, new int[]{}, true},
+ // One sample, same copy number in both records
+ {0, new int[]{0}, new int[]{0}, true},
+ {1, new int[]{1}, new int[]{1}, true},
+ {2, new int[]{2}, new int[]{2}, true},
+ {2, new int[]{3}, new int[]{3}, true},
+ // One sample, copy numbers differ between the records
+ {2, new int[]{1}, new int[]{2}, false},
+ {2, new int[]{2}, new int[]{1}, false},
+ // Two samples, same copy numbers in both records
+ {2, new int[]{2, 2}, new int[]{2, 2}, true},
+ {2, new int[]{4, 2}, new int[]{4, 2}, true},
+ // Two samples, at least one copy number differs between the records
+ {2, new int[]{2, 2}, new int[]{2, 1}, false},
+ {2, new int[]{0, 2}, new int[]{1, 1}, false},
+ {2, new int[]{3, 2}, new int[]{2, 2}, false},
+ {2, new int[]{6, 2}, new int[]{4, 2}, false},
+ };
+ }
+
+    /**
+     * Checks whether two CNV records whose genotypes carry only a copy number attribute are reported
+     * as clusterable. The comparison is run twice: once with the copy numbers stored under
+     * {@code COPY_NUMBER_FORMAT} and once under {@code DEPTH_GENOTYPE_COPY_NUMBER_FORMAT}; both runs
+     * must give the same expected answer.
+     *
+     * @param ploidy        expected copy number assigned to every genotype
+     * @param copyNumbers1  copy numbers of the first record, one entry per sample
+     * @param copyNumbers2  copy numbers of the second record, one entry per sample
+     * @param expected      expected result of {@code areClusterable} for the two records
+     */
+    @Test(dataProvider = "testMatchCNVNoGTData")
+    public void testMatchCNVNoGT(final int ploidy, final int[] copyNumbers1, final int[] copyNumbers2, final boolean expected) {
+        final List<Allele> alleles = Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_CNV);
+        final GATKSVVCFConstants.StructuralVariantAnnotationType svtype = GATKSVVCFConstants.StructuralVariantAnnotationType.CNV;
+        // Create genotypes with copy number attribute (and no GT)
+        final SVCallRecord recordCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+        final SVCallRecord recordCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+
+        // With sample overlap
+        final ClusteringParameters depthOnlyParams = ClusteringParameters.createDepthParameters(0.8, 0, 10000000, 1);
+        final CanonicalSVLinkage<SVCallRecord> linkage = new CanonicalSVLinkage<>(SVTestUtils.hg38Dict, false);
+        linkage.setDepthOnlyParams(depthOnlyParams);
+
+        Assert.assertEquals(linkage.areClusterable(recordCN1, recordCN2), expected);
+
+        // Same comparison with the copy numbers stored under the depth-genotype copy number field
+        final SVCallRecord recordRDCN1 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers1, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+        final SVCallRecord recordRDCN2 = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers2, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+        Assert.assertEquals(linkage.areClusterable(recordRDCN1, recordRDCN2), expected);
+    }
+
@DataProvider(name = "testClusterTogetherIntervaledComplexData")
public Object[][] testClusterTogetherIntervaledComplexData() {
return new Object[][]{
@@ -510,7 +557,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.delINV,
Arrays.asList(SVCallRecord.ComplexEventInterval.decode("DEL_chr1:1100-1500", SVTestUtils.hg38Dict), SVCallRecord.ComplexEventInterval.decode("INV_chr1:1600-1900", SVTestUtils.hg38Dict)),
- 1000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Integer length2 = inferLength(contigA, posA, contigB, posB);
@@ -519,7 +566,7 @@ public void testClusterTogetherIntervaledComplex(final String contigA, final int
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
subtype,
cpxIntervals,
- length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected);
@@ -588,7 +635,7 @@ public void testClusterTogetherInsertedComplex(final String contigA, final int p
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
Arrays.asList(new SVCallRecord.ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval("chr1", 6000, 8000))),
- 2000, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ 2000, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Integer length2 = cpxIntervals.get(0).getInterval().size();
@@ -597,7 +644,7 @@ public void testClusterTogetherInsertedComplex(final String contigA, final int p
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
subtype,
cpxIntervals,
- length2, Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
+ length2, Collections.emptyList(), Collections.singletonList(SVTestUtils.PESR_ALGORITHM),
Lists.newArrayList(Allele.REF_N, SVTestUtils.CPX_ALLELE),
Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
Assert.assertEquals(engine.getLinkage().areClusterable(cpx1, cpx2), expected);
@@ -608,10 +655,10 @@ public void testClusterTogetherVaryParameters() {
final SVClusterEngine testEngine1 = SVTestUtils.getNewDefaultSingleLinkageEngine();
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", 1000, true,
"chr1", 2001, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call2", "chr1", 1100, true,
"chr1", 2101, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- 1002, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ 1002, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
// Cluster with default parameters
Assert.assertTrue(testEngine1.getLinkage().areClusterable(call1, call2));
final ClusteringParameters exactMatchParameters = ClusteringParameters.createDepthParameters(1.0, 0, 0, 1.0);
@@ -650,20 +697,22 @@ public void testAddVaryPositions(final int positionA1, final int positionB1,
}
final SVCallRecord call1 = new SVCallRecord("call1", "chr1", positionA1, true,
"chr1", positionB1, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB1 - positionA1 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB1 - positionA1 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call2 = new SVCallRecord("call1", "chr1", positionA2, true,
"chr1", positionB2, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB2 - positionA2 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB2 - positionA2 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final SVCallRecord call3 = new SVCallRecord("call1", "chr1", positionA3, true,
"chr1", positionB3, false, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Collections.emptyList(),
- positionB3 - positionA3 + 1, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ positionB3 - positionA3 + 1, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
- engine.add(call1);
- engine.add(call2);
- engine.add(call3);
- Assert.assertEquals(engine.forceFlush().size(), result);
+ final List output = new ArrayList<>();
+ output.addAll(engine.addAndFlush(call1));
+ output.addAll(engine.addAndFlush(call2));
+ output.addAll(engine.addAndFlush(call3));
+ output.addAll(engine.flush());
+ Assert.assertEquals(output.size(), result);
}
@Test
@@ -671,22 +720,24 @@ public void testAdd() {
//single-sample merge case, ignoring sample sets
final SVClusterEngine temp1 = SVTestUtils.getNewDefaultSingleLinkageEngine();
Assert.assertTrue(temp1.isEmpty());
- temp1.add(SVTestUtils.call1);
+ final List output1 = new ArrayList<>();
+ output1.addAll(temp1.addAndFlush(SVTestUtils.call1));
Assert.assertFalse(temp1.isEmpty());
//force new cluster by adding a non-overlapping event
- temp1.add(SVTestUtils.call3);
- final List output1 = temp1.forceFlush(); //flushes all clusters
+ output1.addAll(temp1.addAndFlush(SVTestUtils.call3));
+ output1.addAll(temp1.flush()); //flushes all clusters
Assert.assertTrue(temp1.isEmpty());
Assert.assertEquals(output1.size(), 2);
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0));
SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1));
final SVClusterEngine temp2 = SVTestUtils.getNewDefaultSingleLinkageEngine();
- temp2.add(SVTestUtils.call1);
- temp2.add(SVTestUtils.overlapsCall1);
+ final List output2 = new ArrayList<>();
+ output2.addAll(temp2.addAndFlush(SVTestUtils.call1));
+ output2.addAll(temp2.addAndFlush(SVTestUtils.overlapsCall1));
//force new cluster by adding a call on another contig
- temp2.add(SVTestUtils.call4_chr10);
- final List output2 = temp2.forceFlush();
+ output2.addAll(temp2.addAndFlush(SVTestUtils.call4_chr10));
+ output2.addAll(temp2.flush());
Assert.assertEquals(output2.size(), 2);
//median of two items ends up being the second item here
Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
@@ -695,9 +746,10 @@ public void testAdd() {
//checking insensitivity to sample set overlap
final SVClusterEngine temp3 = SVTestUtils.getNewDefaultSingleLinkageEngine();
- temp3.add(SVTestUtils.call1);
- temp3.add(SVTestUtils.sameBoundsSampleMismatch);
- final List output3 = temp3.forceFlush();
+ final List output3 = new ArrayList<>();
+ output3.addAll(temp3.addAndFlush(SVTestUtils.call1));
+ output3.addAll(temp3.addAndFlush(SVTestUtils.sameBoundsSampleMismatch));
+ output3.addAll(temp3.flush());
Assert.assertEquals(output3.size(), 1);
Assert.assertEquals(output3.get(0).getPositionA(), SVTestUtils.call1.getPositionA());
Assert.assertEquals(output3.get(0).getPositionB(), SVTestUtils.call1.getPositionB());
@@ -710,12 +762,13 @@ public void testAddMaxCliqueLarge() {
final int numRecords = 100;
final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine();
final int length = 5000;
+ final List result = new ArrayList<>();
for (int i = 0; i < numRecords; i++) {
final int start = 1000 + 10 * i;
final int end = start + length - 1;
- engine.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL));
+ result.addAll(engine.addAndFlush(SVTestUtils.newPESRCallRecordWithIntervalAndType(start, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
}
- final List result = engine.forceFlush();
+ result.addAll(engine.flush());
Assert.assertEquals(result.size(), 50);
for (final SVCallRecord resultRecord : result) {
Assert.assertTrue(resultRecord.getAttributes().containsKey(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY));
@@ -778,21 +831,14 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll
}
// Create genotypes with copy number attribute (and no GT)
- final List genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length)
- .mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
- .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumbers[i])
- .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy)
- .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy))
- .make())
- .collect(Collectors.toList());
- final SVCallRecord recordWithCopyNumber = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
- "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
- alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
- final Set resultWithCopyNumber = recordWithCopyNumber.getCarrierSampleSet();
-
+ final SVCallRecord recordCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.COPY_NUMBER_FORMAT);
+ final Set resultWithCopyNumber = recordCN.getCarrierSampleSet();
Assert.assertEquals(resultWithCopyNumber, expectedResult);
+ final SVCallRecord recordRDCN = getCNVRecordWithCN(ploidy, alleles, svtype, copyNumbers, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT);
+ final Set resultWithRDCopyNumber = recordRDCN.getCarrierSampleSet();
+ Assert.assertEquals(resultWithRDCopyNumber, expectedResult);
+
// Create genotypes with GT (and no copy number attribute)
final List genotypesWithGenotype = IntStream.range(0, copyNumbers.length)
.mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
@@ -802,13 +848,29 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll
.collect(Collectors.toList());
final SVCallRecord recordWithGenotype = new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
"chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
- 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+ 1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
alleles, GenotypesContext.copy(genotypesWithGenotype), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
final Set resultWithGenotype = recordWithGenotype.getCarrierSampleSet();
Assert.assertEquals(resultWithGenotype, expectedResult);
}
+ private SVCallRecord getCNVRecordWithCN(final int ploidy, final List<Allele> alleles, final GATKSVVCFConstants.StructuralVariantAnnotationType svtype,
+                                         final int[] copyNumbers, final String cnField) {
+     // Helper: one genotype per copyNumbers entry (sample name = index, no-call GT at the given ploidy), with the copy number stored under cnField and ploidy as the expected copy number
+     final List<Genotype> genotypesWithCopyNumber = IntStream.range(0, copyNumbers.length)
+             .mapToObj(i -> new GenotypeBuilder(String.valueOf(i))
+                     .attribute(cnField, copyNumbers[i])
+                     .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy)
+                     .alleles(SVTestUtils.buildHomAlleleListWithPloidy(Allele.NO_CALL, ploidy))
+                     .make())
+             .collect(Collectors.toList());
+     return new SVCallRecord("", "chr1", 1000, SVTestUtils.getValidTestStrandA(svtype),
+             "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, null, Collections.emptyList(),
+             1000, Collections.emptyList(), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM),
+             alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap(), Collections.emptySet(), null, SVTestUtils.hg38Dict);
+ }
+
@Test
public void testLargeRandom() {
final Random rand = new Random(42);
@@ -819,8 +881,14 @@ public void testLargeRandom() {
records.add(SVTestUtils.newPESRCallRecordWithIntervalAndType(Math.min(pos1, pos2), Math.max(pos1, pos2), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL));
}
final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine();
- records.stream().sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict)).forEach(engine::add);
- final List output = engine.forceFlush();
+ final List output = new ArrayList<>(
+ records.stream()
+ .sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict))
+ .map(engine::addAndFlush)
+ .flatMap(List::stream)
+ .collect(Collectors.toUnmodifiableList())
+ );
+ output.addAll(engine.flush());
Assert.assertEquals(output.size(), 2926);
}
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java
new file mode 100644
index 00000000000..c6a7fa9f403
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/stratify/SVStratificationEngineUnitTest.java
@@ -0,0 +1,554 @@
+package org.broadinstitute.hellbender.tools.sv.stratify;
+
+import com.google.common.collect.Lists;
+import htsjdk.samtools.util.Locatable;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
+import org.broadinstitute.hellbender.tools.sv.SVTestUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class SVStratificationEngineUnitTest extends GATKBaseTest {
+    // Unit tests for SVStatificationEngine and its inner Stratum class: track and stratum registration, argument validation, and matching of SV records to strata.
+    private static final GATKPath CONFIG_FILE_PATH = new GATKPath(toolsTestDir + "/sv/sv_stratify_config.tsv");
+
+    private static final String CONTEXT_1_NAME = "context1";
+    private static final String CONTEXT_2_NAME = "context2";
+
+    private static final List<Locatable> CONTEXT_1_INTERVALS = Lists.newArrayList(new SimpleInterval("chr1", 1000, 2000));
+    private static final List<Locatable> CONTEXT_2_INTERVALS = Lists.newArrayList(new SimpleInterval("chr2", 1000, 2000));
+
+    private static SVStatificationEngine makeDefaultEngine() { // new engine on the hg38 test dictionary; no tracks or strata are added here
+        return new SVStatificationEngine(SVTestUtils.hg38Dict);
+    }
+
+    @Test // A registered track is retrievable by name; a name that was never registered yields null.
+    public void testAddContext() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        Assert.assertNotNull(engine.getTrackIntervals(CONTEXT_1_NAME));
+        Assert.assertNull(engine.getTrackIntervals(CONTEXT_2_NAME));
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // Registering a second track under an existing name must be rejected.
+    public void testAddDuplicateContext() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_2_INTERVALS);
+    }
+
+    @Test // An engine to which nothing has been added reports an empty strata collection.
+    public void testNoContexts() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        Assert.assertTrue(engine.getStrata().isEmpty());
+    }
+
+    @Test // A stratum added by its individual fields is returned by getStrata() with those same fields.
+    public void testAddStratification() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME));
+        final Collection<SVStatificationEngine.Stratum> stratificationCollection = engine.getStrata();
+        Assert.assertNotNull(stratificationCollection);
+        Assert.assertEquals(stratificationCollection.size(), 1);
+        final SVStatificationEngine.Stratum stratification = stratificationCollection.iterator().next();
+        Assert.assertNotNull(stratification);
+        Assert.assertEquals(stratification.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        Assert.assertNotNull(stratification.getMinSize());
+        Assert.assertEquals(stratification.getMinSize().intValue(), 50);
+        Assert.assertNotNull(stratification.getMaxSize());
+        Assert.assertEquals(stratification.getMaxSize().intValue(), 500);
+        Assert.assertEquals(stratification.getTrackNames().size(), 1);
+        Assert.assertEquals(stratification.getTrackNames().get(0), CONTEXT_1_NAME);
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // A negative minimum size must be rejected.
+    public void testAddStratificationBadMinSize() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, -1, 500, Collections.emptySet());
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // A negative maximum size must be rejected.
+    public void testAddStratificationBadMaxSize() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, -1, Collections.emptySet());
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // Integer.MAX_VALUE must be rejected as an explicit maximum size.
+    public void testAddStratificationBadMaxSizeInfinity() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, Integer.MAX_VALUE, Collections.emptySet());
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // A maximum size equal to the minimum must be rejected.
+    public void testAddStratificationMaxEqualToMin() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 50, Collections.emptySet());
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class) // A maximum size below the minimum must be rejected.
+    public void testAddStratificationMaxLessThanMin() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 49, Collections.emptySet());
+    }
+
+    @Test // Building an engine from the two test tracks and the test config file is expected to yield 7 strata.
+    public void testCreate() {
+        final Map<String, List<Locatable>> map = new HashMap<>();
+        map.put(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS);
+        final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict);
+        Assert.assertNotNull(engine);
+        Assert.assertNotNull(engine.getTrackIntervals(CONTEXT_1_NAME));
+        Assert.assertEquals(engine.getStrata().size(), 7);
+    }
+
+    @DataProvider(name="testGetMatchVariantsData")
+    public Object[][] testGetMatchVariantsData() {
+        return new Object[][] { // columns: chromA, posA, chromB, posB, svType, svlen (used for INS rows only), expectedStratName (null = no match expected)
+
+                // DEL
+
+                // Outside context interval
+                { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+                { "chr1", 2000, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+                // Simple match
+                { "chr1", 1100, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 900, "chr1", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 900, "chr1", 1900, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 1100, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 999, "chr1", 2001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr2", 1100, "chr2", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                // Wrong contig
+                { "chr3", 1100, "chr3", 1200, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+                // Barely match
+                { "chr1", 1000, "chr1", 3001, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 2, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                { "chr1", 500, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                // Barely miss overlap threshold
+                { "chr1", 1000, "chr1", 3002, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+                // Barely large enough
+                { "chr1", 1100, "chr1", 1149, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, "DEL_50_5k_both" },
+                // Too small
+                { "chr1", 1100, "chr1", 1148, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null },
+
+                // INV (null context)
+
+                // Right size
+                { "chr1", 1001, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+                { "chr1", 4001, "chr1", 5000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+                { "chr2", 10000, "chr2", 20000, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, "INV_gt1kb" },
+                // Too small
+                { "chr1", 1001, "chr1", 1999, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null },
+                { "chr1", 100, "chr1", 200, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, null, null },
+
+                // INS
+
+                // In context
+                { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, "INS_context1" },
+                // SVLEN should not matter
+                { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1, "INS_context1" },
+                { "chr1", 1100, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 10000, "INS_context1" },
+                // Out of context
+                { "chr1", 100, "chr1", 100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 100, null },
+                // Out of size range for context2
+                { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 1000, null },
+                { "chr2", 1100, "chr2", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, 400, null },
+
+                // BND
+
+                // Both ends
+                { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" },
+                { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, "BND_context1" },
+                // One end only
+                { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+                { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+                // No ends
+                { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, null },
+
+                // CTX (same as BND)
+
+                // Both ends
+                { "chr1", 1000, "chr1", 1100, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" },
+                { "chr1", 2000, "chr1", 2000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, "CTX_context1" },
+                // One end only
+                { "chr1", 500, "chr1", 900, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+                { "chr1", 1500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+                // No ends
+                { "chr1", 500, "chr1", 3000, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, null, null },
+        };
+    }
+
+    @Test(dataProvider = "testGetMatchVariantsData") // Checks the first matching stratum's name for each row, or that nothing matches when the expected name is null.
+    public void testGetMatchVariants(final String chromA, final int posA, final String chromB, final int posB,
+                                     final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+                                     final Integer svlen,
+                                     final String expectedStratName) {
+        final Map<String, List<Locatable>> map = new HashMap<>();
+        map.put(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        map.put(CONTEXT_2_NAME, CONTEXT_2_INTERVALS);
+        final SVStatificationEngine engine = SVStatificationEngine.create(map, CONFIG_FILE_PATH, SVTestUtils.hg38Dict);
+        final SVCallRecord record;
+        if (svType == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+            record = SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chromA, posA, svlen);
+        } else {
+            record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, svType);
+        }
+        final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+        if (expectedStratName == null) {
+            Assert.assertTrue(result.isEmpty());
+        } else {
+            Assert.assertFalse(result.isEmpty());
+            Assert.assertEquals(result.iterator().next().getName(), expectedStratName);
+        }
+    }
+
+    // CPX records are not supported: getMatches is expected to throw a GATKException
+    @Test(expectedExceptions = GATKException.class)
+    public void testGetMatchVariantsCpx() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        engine.addTrack("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500)));
+        engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context1"));
+        engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, 50, 500, Collections.singleton("context3"));
+        final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX);
+        // Should throw error
+        engine.getMatches(record, 0.5, 0, 2);
+    }
+
+    @Test // A record overlapping two tracks is expected to be returned for both strata.
+    public void testGetMatchVariantsMultiple() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        engine.addTrack("context3", Lists.newArrayList(new SimpleInterval("chr1", 1500, 2500)));
+        engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context1"));
+        engine.addStratification("strat2", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton("context3"));
+        final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr1", 1800, "chr1", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+        final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+        Assert.assertTrue(names.contains("strat1"));
+        Assert.assertTrue(names.contains("strat2"));
+    }
+
+    @Test // A stratum with an empty track set is expected to match a chr2 record even though the only registered track is on chr1.
+    public void testGetMatchVariantsNullContexts() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet());
+        final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+        final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+        Assert.assertEquals(names.size(), 1);
+        Assert.assertEquals(names.get(0), "strat1");
+    }
+
+    @Test // A stratum with an empty track set is expected to match when the engine has no tracks registered at all.
+    public void testGetMatchVariantsNoEngineContexts() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addStratification("strat1", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.emptySet());
+        final SVCallRecord record = SVTestUtils.newCallRecordWithCoordinatesAndType("record", "chr2", 1800, "chr2", 2100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        final Collection<SVStatificationEngine.Stratum> result = engine.getMatches(record, 0.5, 0, 2);
+        final List<String> names = result.stream().map(SVStatificationEngine.Stratum::getName).collect(Collectors.toList());
+        Assert.assertEquals(names.size(), 1);
+        Assert.assertEquals(names.get(0), "strat1");
+    }
+
+    @Test // A Stratum built directly through the inner-class constructor is returned by getStrata() with the same fields.
+    public void testTestAddStratificationInnerClass() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum stratification = engine.new Stratum("strat", GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, 50, 500, Collections.singleton(CONTEXT_1_NAME));
+        engine.addStratification(stratification);
+        final Collection<SVStatificationEngine.Stratum> stratificationCollection = engine.getStrata();
+        Assert.assertNotNull(stratificationCollection);
+        Assert.assertEquals(stratificationCollection.size(), 1);
+        final SVStatificationEngine.Stratum stratificationOut = stratificationCollection.iterator().next();
+        Assert.assertNotNull(stratificationOut);
+        Assert.assertEquals(stratificationOut.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        Assert.assertNotNull(stratificationOut.getMinSize());
+        Assert.assertEquals(stratificationOut.getMinSize().intValue(), 50);
+        Assert.assertNotNull(stratificationOut.getMaxSize());
+        Assert.assertEquals(stratificationOut.getMaxSize().intValue(), 500);
+        Assert.assertEquals(stratificationOut.getTrackNames().size(), 1);
+        Assert.assertEquals(stratificationOut.getTrackNames().get(0), CONTEXT_1_NAME);
+    }
+
+    @Test // matchesType accepts a record of the stratum's SV type (DEL) and rejects a DUP.
+    public void testMatchesType() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                100, 500,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertFalse(strat.matchesType(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP)));
+    }
+
+    @Test // With bounds 100..500 the minimum itself matches and the maximum itself does not.
+    public void testMatchesSizeSimple() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                100, 500,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+    }
+
+    @Test // With a null minimum, lengths 1, 100 and 499 match and the maximum (500) does not.
+    public void testMatchesSizeNoMin() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                null, 500,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+    }
+
+    @Test // With a null maximum, a length below the minimum (49) fails while 100 and Integer.MAX_VALUE - 1 match.
+    public void testMatchesSizeNoMax() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                50, null,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(49, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+    }
+
+    @Test // With both bounds null, both the smallest and the largest tested lengths match.
+    public void testMatchesSizeNoMinOrMax() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                null, null,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(Integer.MAX_VALUE - 1, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)));
+    }
+
+    @Test // Insertions are size-matched on their length with the same bound behavior as the DEL case (100 matches, 500 does not).
+    public void testMatchesSizeInsertion() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+                100, 500,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(100, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(499, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(50, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(500, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+    }
+
+    @Test // An insertion with null length must not match a stratum that has explicit size bounds, however wide.
+    public void testMatchesSizeInsertionNullLength() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+                0, Integer.MAX_VALUE - 1,
+                Collections.emptySet()
+        );
+        Assert.assertFalse(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+    }
+
+    @Test // An insertion with null length matches when the stratum has no size bounds.
+    public void testMatchesSizeInsertionNullLength2() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
+                null, null,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newCallRecordWithLengthAndType(null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS)));
+    }
+
+    @Test // A BND record matches a stratum that has no size bounds.
+    public void testMatchesSizeBnd() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+                null, null,
+                Collections.emptySet()
+        );
+        Assert.assertTrue(strat.matchesSize(SVTestUtils.newBndCallRecordWithStrands(true, false)));
+    }
+
+
+    @DataProvider(name="testMatchesContextDelData")
+    public Object[][] testMatchesContextDelData() {
+        return new Object[][] {
+                // Columns: chrom, start, end, overlapFraction, numBreakpointOverlaps, expected match against context1 (chr1:1000-2000)
+                { "chr1", 1000, 1500, 0.5, 0, true },
+                { "chr1", 500, 1500, 0.5, 0, true },
+                { "chr1", 499, 1499, 0.5, 0, false },
+                { "chr1", 900, 1300, 0.5, 1, true },
+                { "chr1", 1999, 2000000, 0, 1, true },
+                { "chr1", 500, 600, 0, 2, false },
+                { "chr1", 500, 1100, 0, 2, false },
+                { "chr1", 1100, 1200, 0, 2, true },
+                { "chr1", 1100, 1200, 1, 2, true }
+        };
+    }
+
+    @Test(dataProvider = "testMatchesContextDelData") // Checks matchesTracks for a DEL record against the single context1 track.
+    public void testMatchesContextDel(final String chrom, final int start, final int end,
+                                      final double overlapFraction, final int numBreakpointOverlaps,
+                                      final boolean expected) {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.BND, // NOTE(review): stratum type is BND while the record below is a DEL; presumably matchesTracks ignores SV type - confirm
+                null, null,
+                Collections.singleton(CONTEXT_1_NAME)
+        );
+        Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chrom, start, chrom, end, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL),
+                overlapFraction, numBreakpointOverlaps, 1), expected);
+    }
+
+    @DataProvider(name="testMatchesContextInsData")
+    public Object[][] testMatchesContextInsData() {
+        return new Object[][] {
+                // Columns: chrom, start, length, overlapFraction, numBreakpointOverlaps, expected match against context1 (chr1:1000-2000)
+                { "chr1", 1100, 100, 0.1, 0, true },
+                { "chr1", 1100, 100000, 0.1, 0, true },
+                { "chr1", 999, 100, 0.1, 0, false }
+        };
+    }
+
+    @Test(dataProvider = "testMatchesContextInsData") // Checks matchesTracks for an insertion record against the single context1 track.
+    public void testMatchesContextIns(final String chrom, final int start, final int length,
+                                      final double overlapFraction, final int numBreakpointOverlaps,
+                                      final boolean expected) {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.BND, // NOTE(review): stratum type is BND while the record below is an insertion; presumably matchesTracks ignores SV type - confirm
+                null, null,
+                Collections.singleton(CONTEXT_1_NAME)
+        );
+        Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordInsertionWithLengthAndCoordinates(chrom, start, length),
+                overlapFraction, numBreakpointOverlaps, 1), expected);
+    }
+
+    @DataProvider(name="testMatchesContextBndData")
+    public Object[][] testMatchesContextBndData() {
+        return new Object[][] { // columns: chromA, posA, chromB, posB, numBreakpointOverlapsInterchrom, expected match against context1 (chr1:1000-2000)
+                { "chr1", 999, "chr1", 2001, 1, false },
+                { "chr1", 1000, "chr1", 1200, 1, true },
+                { "chr1", 1000, "chr1", 50000, 1, true },
+                { "chr1", 1000, "chr1", 1000, 1, true },
+                { "chr1", 500, "chr1", 1000, 1, true },
+                { "chr1", 1000, "chr1", 1999, 2, true },
+                { "chr1", 1000, "chr1", 2000, 2, true },
+                { "chr1", 1000, "chr2", 1000, 2, false },
+                { "chr1", 1000, "chr1", 2001, 2, false },
+                { "chr1", 999, "chr1", 1000, 2, false }
+        };
+    }
+
+    @Test(dataProvider = "testMatchesContextBndData") // Checks matchesTracks for a BND record, varying only the last (interchromosomal breakpoint count) argument.
+    public void testMatchesContextBnd(final String chromA, final int posA, final String chromB, final int posB,
+                                      final int numBreakpointOverlapsInterchrom, final boolean expected) {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+                null, null,
+                Collections.singleton(CONTEXT_1_NAME)
+        );
+        Assert.assertEquals(strat.matchesTracks(SVTestUtils.newCallRecordWithCoordinatesAndType("record", chromA, posA, chromB, posB, GATKSVVCFConstants.StructuralVariantAnnotationType.BND),
+                0.5, 2, numBreakpointOverlapsInterchrom), expected);
+    }
+
+    @DataProvider(name="testCountAnyContextOverlapData")
+    public Object[][] testCountAnyContextOverlapData() {
+        return new Object[][] { // columns: chrom, start, end, expected countAnyTrackOverlap result against context1 (chr1:1000-2000)
+                { "chr1", 500, 1500, 1 },
+                { "chr1", 1000, 2000, 1 },
+                { "chr1", 1500, 2500, 1 },
+                { "chr1", 500, 2500, 1 },
+                { "chr1", 1100, 1900, 1 },
+                { "chr1", 999, 999, 0 },
+                { "chr1", 999, 1000, 1 },
+                { "chr1", 1000, 1000, 1 },
+                { "chr1", 1000, 1001, 1 },
+                { "chr2", 1000, 1001, 0 },
+                { "chr1", 1999, 2000, 1 },
+                { "chr1", 2000, 2000, 1 },
+                { "chr1", 2001, 2001, 0 }
+        };
+    }
+
+    @Test(dataProvider = "testCountAnyContextOverlapData") // Checks countAnyTrackOverlap for intervals around the edges of the context1 track.
+    public void testCountAnyContextOverlap(final String chrom, final int start, final int end, final int expected) {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
+                null, null,
+                Collections.singleton(CONTEXT_1_NAME)
+        );
+        Assert.assertEquals(strat.countAnyTrackOverlap(new SimpleInterval(chrom, start, end)), expected);
+    }
+
+    @DataProvider(name="testIsMutuallyExclusiveData") // NOTE(review): no test in this class references this provider; add the consuming test or remove it
+    public Object[][] testIsMutuallyExclusiveData() {
+        return new Object[][] {
+                {GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null,
+                        GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, null, null, null, null,
+                        false},
+        };
+    }
+
+    @Test // Each Stratum getter returns the value passed to the constructor.
+    public void testGetters() {
+        final SVStatificationEngine engine = makeDefaultEngine();
+        engine.addTrack(CONTEXT_1_NAME, CONTEXT_1_INTERVALS);
+        final SVStatificationEngine.Stratum strat = engine.new Stratum(
+                "strat",
+                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                50, 500,
+                Collections.singleton(CONTEXT_1_NAME)
+        );
+        Assert.assertEquals(strat.getTrackNames().size(), 1);
+        Assert.assertEquals(strat.getTrackNames().get(0), CONTEXT_1_NAME);
+        Assert.assertEquals(strat.getSvType(), GATKSVVCFConstants.StructuralVariantAnnotationType.DEL);
+        Assert.assertEquals(strat.getMinSize(), Integer.valueOf(50));
+        Assert.assertEquals(strat.getMaxSize(), Integer.valueOf(500));
+        Assert.assertEquals(strat.getName(), "strat");
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java
new file mode 100644
index 00000000000..734423bdd65
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVClusterIntegrationTest.java
@@ -0,0 +1,112 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.StructuralVariantType;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeader;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.CommandLineProgramTest;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
+import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+import java.io.File;
+import java.util.List;
+
+public class GroupedSVClusterIntegrationTest extends CommandLineProgramTest {
+    /** Runs GroupedSVCluster on the chr22 test VCF with SD/RM tracks, then checks the total record count and three records by ID. */
+    @Test
+    public void testClusterStratified() {
+        final File output = createTempFile("single_linkage_cluster", ".vcf");
+
+        final String clusteringConfigFile = getToolTestDataDir() + "stratified_cluster_params.tsv";
+        final String stratifyConfigFile = getToolTestDataDir() + "stratified_cluster_strata.tsv";
+        final String segdupFile = getToolTestDataDir() + "../SVStratify/hg38.SegDup.chr22.bed";
+        final String segdupName = "SD";
+        final String repeatmaskerFile = getToolTestDataDir() + "../SVStratify/hg38.RM.chr22_subsampled.bed";
+        final String repeatmaskerName = "RM";
+
+        final ArgumentsBuilder args = new ArgumentsBuilder()
+                .addOutput(output)
+                .addVCF(getToolTestDataDir() + "../SVStratify/bwa_melt.chr22.vcf.gz")
+                .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "../SVCluster/1kgp.batch1.ploidy.tsv")
+                .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
+                .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE)
+                .add(GroupedSVCluster.CLUSTERING_CONFIG_FILE_LONG_NAME, clusteringConfigFile)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, stratifyConfigFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, GATKBaseTest.hg38Reference);
+
+        runCommandLine(args, GroupedSVCluster.class.getSimpleName());
+
+        final Pair<VCFHeader, List<VariantContext>> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
+        final List<VariantContext> records = vcf.getValue();
+
+        Assert.assertEquals(records.size(), 1437);
+
+        // Check for specific records
+        int expectedRecordsFound = 0;
+        for (final VariantContext variant : records) {
+            Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); // these three attributes are required on every output record
+            Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY));
+            Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE));
+            if (variant.getID().equals("SVx00000032")) {
+                expectedRecordsFound++;
+                Assert.assertEquals(variant.getContig(), "chr22");
+                Assert.assertEquals(variant.getStart(), 11628747);
+                Assert.assertEquals(variant.getEnd(), 11629803);
+                final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+                Assert.assertEquals(algorithms.size(), 2);
+                Assert.assertTrue(algorithms.contains("manta"));
+                Assert.assertTrue(algorithms.contains("wham"));
+                final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+                Assert.assertEquals(members.size(), 2);
+                final List<Allele> alts = variant.getAlternateAlleles();
+                Assert.assertEquals(alts.size(), 1);
+                Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL);
+                Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL);
+                Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), "DEL_50_5k_SD_RM");
+            } else if (variant.getID().equals("SVx00000125")) {
+                expectedRecordsFound++;
+                Assert.assertEquals(variant.getContig(), "chr22");
+                Assert.assertEquals(variant.getStart(), 22563654);
+                Assert.assertEquals(variant.getEnd(), 22567049);
+                final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+                Assert.assertEquals(algorithms.size(), 1);
+                Assert.assertTrue(algorithms.contains("manta"));
+                final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+                Assert.assertEquals(members.size(), 1);
+                final List<Allele> alts = variant.getAlternateAlleles();
+                Assert.assertEquals(alts.size(), 1);
+                Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL);
+                Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL);
+                Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM);
+            } else if (variant.getID().equals("SVx000001dc")) {
+                expectedRecordsFound++;
+                Assert.assertEquals(variant.getContig(), "chr22");
+                Assert.assertEquals(variant.getStart(), 26060912);
+                Assert.assertEquals(variant.getEnd(), 26060989);
+                final List<String> algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null);
+                Assert.assertEquals(algorithms.size(), 1);
+                Assert.assertTrue(algorithms.contains("manta"));
+                final List<String> members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null);
+                Assert.assertEquals(members.size(), 1);
+                final List<Allele> alts = variant.getAlternateAlleles();
+                Assert.assertEquals(alts.size(), 1);
+                Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP);
+                Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP);
+                Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), SVStratify.DEFAULT_STRATUM);
+            }
+        }
+        Assert.assertEquals(expectedRecordsFound, 3); // all three spot-checked IDs must have been encountered
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
index 8f27d2a6389..136e0c36ff0 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java
@@ -234,7 +234,6 @@ public void testOverlappingEvents(final List inputVcfs) {
//in NA11829 variant events are not overlapping, so there should be a CN2 homRef in between
final List samplesWithOverlaps = Arrays.asList("HG00365", "HG01789", "HG02221", "NA07357", "NA12005", "NA12873", "NA18997", "NA19428", "NA21120");
- final List samplesWithGaps = Arrays.asList("NA11829");
//all of these samples have an event that overlaps the next event, which is not called in that sample
boolean sawVariant;
@@ -254,18 +253,18 @@ public void testOverlappingEvents(final List inputVcfs) {
}
//these samples have a variant that doesn't overlap the next call
- for (final String sample : samplesWithGaps) {
- sawVariant = false;
- for (final VariantContext vc : overlappingEvents.getRight()) {
- if (!sawVariant && !vc.getGenotype(sample).isHomRef()) {
- sawVariant = true;
- continue;
- }
- if (sawVariant) {
- Assert.assertTrue(vc.getGenotype(sample).isHomRef()
- && (Integer.parseInt(vc.getGenotype(sample).getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 2));
- break;
- }
+ sawVariant = false;
+ for (final VariantContext vc : overlappingEvents.getRight()) {
+ final Genotype genotype = vc.getGenotype("NA11829");
+ if (!sawVariant && !genotype.isHomRef()) {
+ sawVariant = true;
+ continue;
+ }
+ if (sawVariant && vc.getEnd() == 23236095) {
+ // Smaller variant nested inside larger hom-var DEL: hom-ref genotype but CN is 0 since it overlaps
+ Assert.assertTrue(genotype.isHomRef()
+ && (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT).toString()) == 0));
+ break;
}
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
index caff0aa9675..75016e8744b 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java
@@ -23,6 +23,7 @@
import org.testng.annotations.Test;
import java.io.File;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
@@ -43,7 +44,7 @@ public void testDefragmentation() {
.addVCF(inputVcfPath)
.add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv")
.add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
- .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.DEFRAGMENT_CNV)
+ .add(SVCluster.ALGORITHM_LONG_NAME, SVClusterWalker.CLUSTER_ALGORITHM.DEFRAGMENT_CNV)
.add(SVCluster.DEFRAG_PADDING_FRACTION_LONG_NAME, 0.25)
.add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0.5);
@@ -293,17 +294,23 @@ public void testAgainstSimpleImplementation() {
mixedParameters,
pesrParameters);
- vcfInputFilenames.stream()
- .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream())
- .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary()))
- .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict))
- .forEach(engine::add);
+ final List expectedRecords = new ArrayList<>();
+ expectedRecords.addAll(
+ vcfInputFilenames.stream()
+ .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream())
+ .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary()))
+ .map(v -> SVCallRecordUtils.create(v, SVTestUtils.hg38Dict))
+ .map(engine::addAndFlush)
+ .flatMap(List::stream)
+ .collect(Collectors.toList())
+ );
+ expectedRecords.addAll(engine.flush());
- final Comparator recordComparator = SVCallRecordUtils.getCallComparator(referenceSequenceFile.getSequenceDictionary());
- final List expectedVariants = engine.forceFlush().stream()
- .sorted(recordComparator)
+ final Comparator recordComparator = testVcf.getLeft().getVCFRecordComparator();
+ final List expectedVariants = expectedRecords.stream()
.map(SVCallRecordUtils::getVariantBuilder)
.map(VariantContextBuilder::make)
+ .sorted(recordComparator)
.collect(Collectors.toList());
final List testVariants = testVcf.getValue();
@@ -453,11 +460,11 @@ public void testClusterSampleOverlap() {
final int nonRefGenotypeCount = (int) variant.getGenotypes().stream().filter(g -> SVCallRecordUtils.isAltGenotype(g)).count();
Assert.assertEquals(nonRefGenotypeCount, 71);
final int alleleCount = (int) variant.getGenotypes().stream().flatMap(g -> g.getAlleles().stream()).filter(SVCallRecordUtils::isAltAllele).count();
- Assert.assertEquals(alleleCount, 94);
+ Assert.assertEquals(alleleCount, 87);
final Genotype g = variant.getGenotype("HG00129");
- Assert.assertTrue(g.isHomVar());
+ Assert.assertTrue(g.isHet());
Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2);
- Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 0);
+ Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 1);
}
}
Assert.assertEquals(expectedRecordsFound, 1);
@@ -533,5 +540,34 @@ public void testAllosome() {
}
Assert.assertEquals(expectedRecordsFound, 1);
}
+    @Test
+    public void testCleanedVcf() {
+        final File output = createTempFile("cleaned_vcf_cluster", ".vcf");
+        // Note we use very loose clustering criteria on a normal cleaned vcf to ensure some clustering happens
+        final ArgumentsBuilder args = new ArgumentsBuilder()
+                .addOutput(output)
+                .addVCF(getToolTestDataDir() + "bwa_melt.cleaned.chr22_chrY.vcf.gz")
+                .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv")
+                .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx")
+                .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE)
+                .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH)
+                .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+                .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+                .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 10000000)
+                .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+                .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+                .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 5000)
+                .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0)
+                .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1)
+                .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 5000);
+
+        runCommandLine(args, SVCluster.class.getSimpleName());
+        // Verify the number of samples in the output header and the number of output records
+        final Pair<VCFHeader, List<VariantContext>> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
+        final VCFHeader header = vcf.getKey();
+        Assert.assertEquals(header.getSampleNamesInOrder().size(), 161);
+        final List<VariantContext> records = vcf.getValue();
+        Assert.assertEquals(records.size(), 1227);
+    }
}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java
new file mode 100644
index 00000000000..c3badd60497
--- /dev/null
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVStratifyIntegrationTest.java
@@ -0,0 +1,208 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import com.google.common.collect.Lists;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeader;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.CommandLineProgramTest;
+import org.broadinstitute.hellbender.GATKBaseTest;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
+import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
+import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.sv.stratify.SVStratificationEngineArgumentsCollection;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.vcf.VcfUtils;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class SVStratifyIntegrationTest extends CommandLineProgramTest {
+
+    @Test
+    public void testBwaMeltCohort() {
+        final File outputDir = createTempDir("stratify");
+        final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+        final String configFile = getToolTestDataDir() + "test_config.tsv";
+        // Interval tracks to register; each is passed below as a name followed by its BED file
+        final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+        final String segdupName = "SD";
+        final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+        final String repeatmaskerName = "RM";
+
+        final ArgumentsBuilder args = new ArgumentsBuilder()
+                .addOutput(outputDir)
+                .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+                .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+                .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
+
+        runCommandLine(args, SVStratify.class.getSimpleName());
+        // Expect 14 files in the output directory, 7 of them VCFs: one per suffix in the map below
+        final File[] outputFiles = outputDir.listFiles();
+        Assert.assertEquals(outputFiles.length, 14);
+        final Map<String, Integer> expectedOutputSuffixes = new HashMap<>();
+        expectedOutputSuffixes.put("INS_small_SD", 46);
+        expectedOutputSuffixes.put("DEL_50_5k_both", 110);
+        expectedOutputSuffixes.put("DEL_5k_50k_SD", 2);
+        expectedOutputSuffixes.put("DUP_lt5kb_RM", 0);
+        expectedOutputSuffixes.put("INV_gt1kb", 26);
+        expectedOutputSuffixes.put("BND_SD", 77);
+        expectedOutputSuffixes.put(SVStratify.DEFAULT_STRATUM, 1196);
+        int numVcfs = 0;
+        int totalRecords = 0;
+        for (final File file : outputFiles) {
+            if (VcfUtils.isVariantFile(file)) {
+                ++numVcfs;
+                final Pair<VCFHeader, List<VariantContext>> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(file.getAbsolutePath());
+                boolean foundSuffix = false;
+                for (final String suffix : expectedOutputSuffixes.keySet()) {
+                    if (file.toString().contains("." + suffix + ".")) { // match the suffix as a dot-delimited token of the file name
+                        foundSuffix = true;
+                        for (final VariantContext variant : outputVcf.getRight()) {
+                            Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY));
+                            Assert.assertEquals(variant.getAttribute(GATKSVVCFConstants.STRATUM_INFO_KEY), suffix);
+                        }
+                        final int expectedSize = expectedOutputSuffixes.get(suffix).intValue();
+                        final int actualSize = outputVcf.getRight().size();
+                        Assert.assertEquals(actualSize, expectedSize,
+                                "Expected " + expectedSize + " records but found " + actualSize + " in " + suffix);
+                        totalRecords += actualSize;
+                        break;
+                    }
+                }
+                Assert.assertTrue(foundSuffix, "Unexpected file suffix: " + file.getAbsolutePath());
+            }
+        }
+        Assert.assertEquals(numVcfs, 7);
+        final int numInputRecords = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath).getRight().size();
+        Assert.assertEquals(totalRecords, numInputRecords); // the per-stratum record counts must add up to the input record count
+    }
+
+    @Test
+    public void testBwaMeltCohortSingleOutput() {
+        final File outputDir = createTempDir("stratify");
+        final File outputFile = outputDir.toPath().resolve("out.vcf.gz").toFile();
+        final String inputVcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+        final String configFile = getToolTestDataDir() + "test_config.tsv";
+        // Interval tracks to register; each is passed below as a name followed by its BED file
+        final String segdupFile = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+        final String segdupName = "SD";
+        final String repeatmaskerFile = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+        final String repeatmaskerName = "RM";
+
+        final ArgumentsBuilder args = new ArgumentsBuilder()
+                .addOutput(outputFile)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, configFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, segdupName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, segdupFile)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, repeatmaskerName)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, repeatmaskerFile)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+                .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVcfPath);
+
+        runCommandLine(args, SVStratify.class.getSimpleName());
+        // Exactly one VCF (the requested output path) is expected, holding as many records as the input
+        final List<File> outputFiles = Lists.newArrayList(outputDir.listFiles()).stream().filter(VcfUtils::isVariantFile).collect(Collectors.toUnmodifiableList());
+        Assert.assertEquals(outputFiles.size(), 1);
+        Assert.assertEquals(outputFiles.get(0).getAbsolutePath(), outputFile.getAbsolutePath());
+        final Pair<VCFHeader, List<VariantContext>> inputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(inputVcfPath);
+        final Pair<VCFHeader, List<VariantContext>> outputVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath());
+        Assert.assertEquals(outputVcf.getRight().size(), inputVcf.getRight().size());
+    }
+
+    @Test(expectedExceptions = GATKException.class)
+    public void testBwaMeltCohortRedundant() {
+        final File splitOutputDir = createTempDir("stratify");
+        final String vcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+        final String redundantConfig = getToolTestDataDir() + "test_config_redundant.tsv";
+        // Track definitions: BED file plus the name it is registered under
+        final String sdBed = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+        final String sdTrack = "SD";
+        final String rmBed = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+        final String rmTrack = "RM";
+
+        final ArgumentsBuilder toolArgs = new ArgumentsBuilder()
+                .addOutput(splitOutputDir)
+                .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+                .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, redundantConfig)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, sdTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, sdBed)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, rmTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, rmBed)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+                .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, vcfPath);
+        // The run itself is the assertion: a GATKException is expected for this config
+        runCommandLine(toolArgs, SVStratify.class.getSimpleName());
+    }
+
+    @Test
+    public void testBwaMeltCohortBypassRedundant() {
+        final File splitOutputDir = createTempDir("stratify");
+        final String vcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+        final String redundantConfig = getToolTestDataDir() + "test_config_redundant.tsv";
+        // Track definitions: BED file plus the name it is registered under
+        final String sdBed = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+        final String sdTrack = "SD";
+        final String rmBed = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+        final String rmTrack = "RM";
+
+        final ArgumentsBuilder toolArgs = new ArgumentsBuilder()
+                .addOutput(splitOutputDir)
+                .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+                .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, redundantConfig)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, sdTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, sdBed)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, rmTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, rmBed)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+                .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, vcfPath)
+                .addFlag(SVStratify.ALLOW_MULTIPLE_MATCHES_LONG_NAME);
+        // No assertions follow: the test passes as long as the tool runs without throwing
+        runCommandLine(toolArgs, SVStratify.class.getSimpleName());
+    }
+
+    @Test(expectedExceptions = {GATKException.class})
+    public void testBwaMeltCohortDuplicateContextName() {
+        final File splitOutputDir = createTempDir("stratify");
+        final String vcfPath = getToolTestDataDir() + "bwa_melt.chr22.vcf.gz";
+        final String duplicateConfig = getToolTestDataDir() + "test_config_duplicate.tsv";
+        // Track definitions: BED file plus the name it is registered under
+        final String sdBed = getToolTestDataDir() + "hg38.SegDup.chr22.bed";
+        final String sdTrack = "SD";
+        final String rmBed = getToolTestDataDir() + "hg38.RM.chr22_subsampled.bed";
+        final String rmTrack = "RM";
+
+        final ArgumentsBuilder toolArgs = new ArgumentsBuilder()
+                .addOutput(splitOutputDir)
+                .add(CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, "test")
+                .add(SVStratify.SPLIT_OUTPUT_LONG_NAME, true)
+                .add(SVStratificationEngineArgumentsCollection.STRATIFY_CONFIG_FILE_LONG_NAME, duplicateConfig)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, sdTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, sdBed)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_NAME_FILE_LONG_NAME, rmTrack)
+                .add(SVStratificationEngineArgumentsCollection.TRACK_INTERVAL_FILE_LONG_NAME, rmBed)
+                .add(SVStratificationEngineArgumentsCollection.OVERLAP_FRACTION_LONG_NAME, 0.5)
+                .add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
+                .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, vcfPath);
+        // The run itself is the assertion: a GATKException is expected for this config
+        runCommandLine(toolArgs, SVStratify.class.getSimpleName());
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtilsUnitTest.java
index baefc91548e..a4c6559b627 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtilsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtilsUnitTest.java
@@ -659,6 +659,27 @@ public void testMergeGenotypesUniquify() {
Assert.assertEquals(merged.getSampleNames(), new LinkedHashSet<>(Arrays.asList("s1.1", "s1.2")));
}
+    @Test
+    public void testMergeSourceIDs() {
+        final VariantContext firstWithSource = new VariantContextBuilder(
+                makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))).source("source1").make();
+        final VariantContext secondWithSource = new VariantContextBuilder(
+                makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))).source("source2").make();
+        final VariantContext mergedUnlimited = GATKVariantContextUtils.simpleMerge(
+                Arrays.asList(firstWithSource, secondWithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
+                GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, -1);
+
+        // With -1 as the last argument, both source names are joined with a hyphen
+        Assert.assertEquals(mergedUnlimited.getSource(), "source1-source2");
+
+        final VariantContext mergedLimited = GATKVariantContextUtils.simpleMerge(
+                Arrays.asList(firstWithSource, secondWithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
+                GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, 2);
+
+        // With 2 as the last argument, the merged source is cut down to its first two characters
+        Assert.assertEquals(mergedLimited.getSource(), "so");
+    }
+
// TODO: remove after testing
// @Test(expectedExceptions = IllegalStateException.class)
// public void testMergeGenotypesRequireUnique() {
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv
new file mode 100644
index 00000000000..b75575e45ae
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/sv/sv_stratify_config.tsv
@@ -0,0 +1,8 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
+INS_context1 INS -1 -1 context1
+DEL_50_5k_both DEL 50 5000 context1,context2
+DEL_5k_50k_context1 DEL 5000 50000 context1
+DUP_lt5kb_context1 DUP -1 5000 context1
+INV_gt1kb INV 1000 -1 NULL
+BND_context1 BND -1 -1 context1
+CTX_context1 CTX -1 -1 context1
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv
new file mode 100644
index 00000000000..b94357460a6
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_params.tsv
@@ -0,0 +1,3 @@
+NAME RECIPROCAL_OVERLAP SIZE_SIMILARITY BREAKEND_WINDOW SAMPLE_OVERLAP
+INS_small_SD 0.1 0.5 50 0
+DEL_50_5k_SD_RM 0.1 0.5 1000 0
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv
new file mode 100644
index 00000000000..9cb836c9d33
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/GroupedSVCluster/stratified_cluster_strata.tsv
@@ -0,0 +1,3 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
+INS_small_SD INS -1 -1 SD
+DEL_50_5k_SD_RM DEL 50 5000 SD,RM
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz
new file mode 100644
index 00000000000..ebf9af067ff
Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi
new file mode 100644
index 00000000000..c394f2c6989
Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/bwa_melt.cleaned.chr22_chrY.vcf.gz.tbi differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz
index 0fc84b559b1..8632e8cbcd2 100644
Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi
index 5827787165f..191d92f3007 100644
Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance/ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz.tbi differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz
new file mode 100644
index 00000000000..9a85bb047b6
Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi
new file mode 100644
index 00000000000..86dc452a4ad
Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/bwa_melt.chr22.vcf.gz.tbi differ
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed
new file mode 100644
index 00000000000..999a8da1b4c
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.RM.chr22_subsampled.bed
@@ -0,0 +1,1000 @@
+chr22 10548844 10549072
+chr22 10600155 10600379
+chr22 10609936 10610026
+chr22 10680159 10684708
+chr22 10731111 10731244
+chr22 10782698 10783246
+chr22 11032485 11032795
+chr22 11053141 11053245
+chr22 11056252 11056541
+chr22 11264429 11264489
+chr22 11354245 11354759
+chr22 11480660 11480856
+chr22 11485802 11485889
+chr22 11814558 11816405
+chr22 11877778 11878081
+chr22 11879854 11880179
+chr22 11929231 11929483
+chr22 12066840 12067240
+chr22 12101013 12101573
+chr22 12277861 12278206
+chr22 12310868 12310957
+chr22 12323627 12323767
+chr22 12558845 12559121
+chr22 12574259 12574469
+chr22 12582498 12582554
+chr22 12871618 12872075
+chr22 12880177 12880513
+chr22 12883467 12883802
+chr22 12888865 12889065
+chr22 15223165 15223252
+chr22 15309236 15309325
+chr22 15323774 15323965
+chr22 15485028 15485135
+chr22 15499798 15499999
+chr22 15590783 15591195
+chr22 15594920 15594975
+chr22 15717078 15717160
+chr22 15756004 15756039
+chr22 15771644 15771788
+chr22 15803438 15805331
+chr22 15831943 15832771
+chr22 15856856 15856897
+chr22 16073496 16085556
+chr22 16235053 16241190
+chr22 16341035 16341082
+chr22 16436337 16436641
+chr22 16522669 16522956
+chr22 16569061 16570644
+chr22 16579154 16579393
+chr22 16588039 16588645
+chr22 16624386 16624679
+chr22 16649026 16649063
+chr22 16716314 16716812
+chr22 16733932 16734095
+chr22 16757884 16759904
+chr22 16765013 16766359
+chr22 16773365 16773644
+chr22 16812343 16812932
+chr22 16922577 16923317
+chr22 17053575 17055162
+chr22 17099401 17099577
+chr22 17123718 17123756
+chr22 17123860 17124390
+chr22 17162538 17162584
+chr22 17192233 17192324
+chr22 17219665 17219975
+chr22 17231307 17231631
+chr22 17289154 17289255
+chr22 17307511 17307713
+chr22 17345763 17346002
+chr22 17424135 17424269
+chr22 17502119 17502298
+chr22 17586746 17586774
+chr22 17622685 17622996
+chr22 17631522 17631618
+chr22 17635470 17635585
+chr22 17719382 17720182
+chr22 17740160 17740924
+chr22 17761783 17761859
+chr22 17834434 17834561
+chr22 17889544 17889827
+chr22 17937381 17937690
+chr22 17969044 17969338
+chr22 18006551 18006852
+chr22 18028500 18028590
+chr22 18029082 18029800
+chr22 18175694 18176298
+chr22 18219880 18219923
+chr22 18236756 18237477
+chr22 18350047 18350201
+chr22 18393343 18394705
+chr22 18412200 18412296
+chr22 18519917 18520001
+chr22 18525505 18526658
+chr22 18544911 18545084
+chr22 18602476 18602976
+chr22 18618302 18618432
+chr22 18782455 18782585
+chr22 18789650 18789809
+chr22 18791336 18791659
+chr22 18930115 18930281
+chr22 19020829 19021146
+chr22 19074769 19075115
+chr22 19079365 19079539
+chr22 19122416 19122479
+chr22 19239936 19240262
+chr22 19264258 19264515
+chr22 19285483 19285618
+chr22 19412935 19413146
+chr22 19423144 19423261
+chr22 19425679 19426053
+chr22 19529955 19532958
+chr22 19614049 19614150
+chr22 19616914 19617330
+chr22 19703132 19703437
+chr22 19868564 19868856
+chr22 19881943 19882154
+chr22 19882823 19882847
+chr22 19917357 19917461
+chr22 19958599 19958908
+chr22 19997920 19997950
+chr22 20047588 20048524
+chr22 20090527 20090757
+chr22 20182900 20182943
+chr22 20186025 20186060
+chr22 20220167 20220250
+chr22 20385600 20386176
+chr22 20442982 20443216
+chr22 20671842 20672157
+chr22 20688300 20688577
+chr22 20698245 20698539
+chr22 20716963 20717151
+chr22 20770579 20770632
+chr22 20797635 20797751
+chr22 20902600 20902905
+chr22 20941588 20941824
+chr22 21061659 21062388
+chr22 21132602 21132648
+chr22 21138170 21138284
+chr22 21164124 21165687
+chr22 21245971 21246024
+chr22 21316132 21316331
+chr22 21361921 21362255
+chr22 21400616 21400737
+chr22 21434587 21434702
+chr22 21461288 21461573
+chr22 21463519 21463670
+chr22 21466663 21466779
+chr22 21479125 21479456
+chr22 21553026 21553349
+chr22 21556425 21556761
+chr22 21589142 21589445
+chr22 21639819 21640102
+chr22 21701433 21701463
+chr22 21718903 21719107
+chr22 21775784 21776083
+chr22 21831369 21831481
+chr22 21836073 21836376
+chr22 21848700 21848889
+chr22 21872807 21873246
+chr22 21891165 21891946
+chr22 21905524 21906250
+chr22 22050964 22057732
+chr22 22082044 22083267
+chr22 22121138 22121179
+chr22 22129648 22129755
+chr22 22164599 22164628
+chr22 22218589 22219103
+chr22 22495319 22495644
+chr22 22502784 22503399
+chr22 22504151 22504451
+chr22 22542826 22543564
+chr22 22565742 22566184
+chr22 22577836 22577960
+chr22 22648947 22649064
+chr22 22699544 22699764
+chr22 22763058 22763414
+chr22 22773790 22773969
+chr22 22793388 22793691
+chr22 22821356 22821690
+chr22 22870885 22871146
+chr22 22874608 22875120
+chr22 22897694 22897982
+chr22 23007411 23007437
+chr22 23132162 23132268
+chr22 23155335 23155434
+chr22 23210141 23210438
+chr22 23211497 23211784
+chr22 23320710 23321000
+chr22 23340561 23340866
+chr22 23345739 23345841
+chr22 23354792 23354955
+chr22 23364979 23365063
+chr22 23485148 23485264
+chr22 23529915 23530118
+chr22 23604796 23605118
+chr22 23608743 23608792
+chr22 23623122 23623429
+chr22 23645005 23645150
+chr22 23647367 23648106
+chr22 23648786 23649066
+chr22 23653551 23653810
+chr22 23655810 23655918
+chr22 23656994 23657307
+chr22 23771428 23771724
+chr22 23782016 23782139
+chr22 23804540 23804840
+chr22 23821434 23822002
+chr22 23840019 23840959
+chr22 23844212 23844249
+chr22 23847086 23847387
+chr22 23873302 23873329
+chr22 23897728 23897772
+chr22 23916791 23916844
+chr22 23916853 23917151
+chr22 23935971 23936014
+chr22 23978581 23978867
+chr22 23994211 23994639
+chr22 24047382 24047506
+chr22 24103759 24103790
+chr22 24137140 24137167
+chr22 24173428 24173547
+chr22 24203943 24204595
+chr22 24205135 24205305
+chr22 24210891 24212328
+chr22 24255042 24255345
+chr22 24380932 24381051
+chr22 24383294 24383415
+chr22 24424361 24424398
+chr22 24492032 24492098
+chr22 24551459 24552386
+chr22 24699562 24699706
+chr22 24728884 24729357
+chr22 24839371 24841737
+chr22 24970607 24970624
+chr22 24970876 24970937
+chr22 24976297 24976446
+chr22 25099930 25100029
+chr22 25109577 25109632
+chr22 25127039 25127224
+chr22 25130349 25130382
+chr22 25131042 25131144
+chr22 25153813 25153910
+chr22 25187512 25187886
+chr22 25204147 25204315
+chr22 25222687 25223012
+chr22 25224528 25224774
+chr22 25232497 25232672
+chr22 25236404 25236527
+chr22 25290873 25291544
+chr22 25325040 25325146
+chr22 25333316 25333623
+chr22 25365035 25365464
+chr22 25385771 25385864
+chr22 25387400 25387433
+chr22 25447307 25447615
+chr22 25462934 25463153
+chr22 25468278 25468437
+chr22 25471172 25471244
+chr22 25471678 25471718
+chr22 25489661 25489880
+chr22 25506902 25506943
+chr22 25514575 25514679
+chr22 25522465 25522536
+chr22 25523158 25523453
+chr22 25623794 25623900
+chr22 25684884 25685075
+chr22 25699118 25699192
+chr22 25778974 25779206
+chr22 25779269 25779406
+chr22 25792075 25792124
+chr22 25798550 25798864
+chr22 25801576 25801695
+chr22 25826777 25827082
+chr22 25828077 25828303
+chr22 25840236 25840633
+chr22 25865514 25865758
+chr22 25949759 25949805
+chr22 25955692 25955874
+chr22 26004081 26004166
+chr22 26012056 26012342
+chr22 26018670 26018968
+chr22 26035184 26035595
+chr22 26079579 26079749
+chr22 26105361 26105434
+chr22 26131043 26131182
+chr22 26133372 26133621
+chr22 26167276 26167791
+chr22 26182013 26182155
+chr22 26188864 26188929
+chr22 26189817 26190126
+chr22 26200324 26200423
+chr22 26253919 26254314
+chr22 26309875 26310021
+chr22 26310070 26310590
+chr22 26314441 26314603
+chr22 26348077 26348190
+chr22 26405271 26405417
+chr22 26416724 26416760
+chr22 26425678 26426074
+chr22 26426415 26426809
+chr22 26434712 26434902
+chr22 26497474 26498687
+chr22 26559017 26559708
+chr22 26563156 26563297
+chr22 26653442 26654242
+chr22 26667161 26667265
+chr22 26676627 26676737
+chr22 26721609 26721830
+chr22 26743272 26743453
+chr22 26832554 26832948
+chr22 26872117 26874640
+chr22 26888939 26889127
+chr22 26889140 26889413
+chr22 26896135 26896351
+chr22 26908893 26909291
+chr22 26937973 26938087
+chr22 26959677 26959840
+chr22 26972578 26972768
+chr22 26976839 26977262
+chr22 27054525 27054697
+chr22 27118989 27119358
+chr22 27125182 27125301
+chr22 27149328 27149420
+chr22 27154121 27154413
+chr22 27155709 27156029
+chr22 27203271 27203343
+chr22 27208639 27208846
+chr22 27251103 27251392
+chr22 27254263 27254561
+chr22 27314767 27315129
+chr22 27332833 27332945
+chr22 27397534 27397696
+chr22 27479120 27479142
+chr22 27498898 27498945
+chr22 27544436 27544747
+chr22 27555680 27556723
+chr22 27595098 27595399
+chr22 27604216 27604320
+chr22 27615766 27615843
+chr22 27710176 27710269
+chr22 27721253 27721406
+chr22 27723543 27723731
+chr22 27726072 27726218
+chr22 27814110 27814976
+chr22 27839763 27840785
+chr22 27880022 27880212
+chr22 27883487 27883603
+chr22 28015118 28015209
+chr22 28024082 28024186
+chr22 28064438 28064633
+chr22 28176743 28178163
+chr22 28187992 28188124
+chr22 28251637 28251748
+chr22 28289749 28290033
+chr22 28310641 28310700
+chr22 28329337 28329406
+chr22 28381134 28381162
+chr22 28401643 28401778
+chr22 28409528 28409629
+chr22 28415006 28415201
+chr22 28418923 28419082
+chr22 28430036 28430060
+chr22 28455431 28455730
+chr22 28513640 28513818
+chr22 28518908 28518995
+chr22 28650024 28650444
+chr22 28701822 28701953
+chr22 28775616 28775913
+chr22 28792574 28792856
+chr22 28833119 28833382
+chr22 28833852 28834020
+chr22 28834758 28834829
+chr22 28912970 28913120
+chr22 28938422 28938564
+chr22 28981491 28982242
+chr22 29081893 29082086
+chr22 29143460 29143755
+chr22 29163942 29164113
+chr22 29170703 29171432
+chr22 29174297 29174518
+chr22 29297142 29297369
+chr22 29398153 29398318
+chr22 29406314 29406548
+chr22 29501553 29501859
+chr22 29514716 29515004
+chr22 29535761 29535966
+chr22 29557419 29557729
+chr22 29566176 29566295
+chr22 29580406 29580553
+chr22 29581239 29581340
+chr22 29586474 29586767
+chr22 29617663 29617825
+chr22 29634162 29634260
+chr22 29665682 29666520
+chr22 29715967 29716040
+chr22 29718253 29718457
+chr22 29719525 29719610
+chr22 29724468 29724585
+chr22 29725438 29725856
+chr22 29830353 29830897
+chr22 29832587 29832904
+chr22 29875852 29876389
+chr22 29880939 29881125
+chr22 29921550 29921840
+chr22 29972875 29973055
+chr22 29985217 29985524
+chr22 30010195 30010275
+chr22 30061250 30062064
+chr22 30074019 30074319
+chr22 30139419 30139483
+chr22 30215968 30216128
+chr22 30218486 30218618
+chr22 30228070 30228602
+chr22 30249672 30249811
+chr22 30250107 30250329
+chr22 30261489 30261656
+chr22 30275498 30275569
+chr22 30326885 30326962
+chr22 30388467 30388727
+chr22 30426541 30426668
+chr22 30519418 30519724
+chr22 30603832 30604756
+chr22 30618339 30618650
+chr22 30621834 30621889
+chr22 30635628 30635664
+chr22 30650005 30650048
+chr22 30684209 30684324
+chr22 30692329 30693031
+chr22 30701318 30701642
+chr22 30814859 30814964
+chr22 30833889 30836255
+chr22 30840527 30840814
+chr22 30882409 30882545
+chr22 30890035 30890251
+chr22 30908477 30908554
+chr22 30938879 30939097
+chr22 30949286 30949371
+chr22 30969991 30970046
+chr22 30998146 30998676
+chr22 31015923 31016407
+chr22 31026247 31026300
+chr22 31139624 31139849
+chr22 31148061 31148545
+chr22 31174379 31174667
+chr22 31309396 31309706
+chr22 31309735 31310541
+chr22 31320793 31320991
+chr22 31421027 31421421
+chr22 31513556 31514163
+chr22 31536405 31536712
+chr22 31547159 31547189
+chr22 31551058 31551235
+chr22 31587837 31587898
+chr22 31660378 31660686
+chr22 31702215 31702520
+chr22 31721636 31721835
+chr22 31729003 31729102
+chr22 31729642 31729888
+chr22 31774204 31774498
+chr22 31819464 31819769
+chr22 31869414 31869586
+chr22 31873507 31873601
+chr22 31894208 31894493
+chr22 31951219 31951254
+chr22 31969166 31969543
+chr22 31989462 31989790
+chr22 32127012 32127492
+chr22 32162804 32162991
+chr22 32164477 32164787
+chr22 32166290 32166768
+chr22 32186803 32186939
+chr22 32213208 32213454
+chr22 32218125 32218436
+chr22 32326693 32327276
+chr22 32358668 32358742
+chr22 32424789 32425081
+chr22 32437379 32437690
+chr22 32483376 32483736
+chr22 32565535 32565688
+chr22 32586190 32586800
+chr22 32628594 32628761
+chr22 32639944 32640383
+chr22 32757104 32758743
+chr22 32820909 32821039
+chr22 32832739 32833031
+chr22 32914273 32914836
+chr22 32925863 32926151
+chr22 32953816 32953842
+chr22 32981675 32981795
+chr22 32982556 32982754
+chr22 32997166 32997260
+chr22 33055955 33056011
+chr22 33056342 33056661
+chr22 33064295 33064545
+chr22 33131273 33131312
+chr22 33235487 33235824
+chr22 33241110 33241506
+chr22 33257778 33258719
+chr22 33265495 33265721
+chr22 33272361 33272394
+chr22 33275182 33275777
+chr22 33280099 33280349
+chr22 33292748 33292941
+chr22 33324896 33325119
+chr22 33423800 33424047
+chr22 33442056 33442231
+chr22 33503244 33503406
+chr22 33550336 33551263
+chr22 33582064 33582259
+chr22 33606658 33607090
+chr22 33623748 33624056
+chr22 33698325 33698604
+chr22 33713966 33714035
+chr22 33726866 33726930
+chr22 33816689 33816762
+chr22 33818283 33818417
+chr22 33831749 33831796
+chr22 33890146 33890391
+chr22 33948751 33949070
+chr22 34033459 34033810
+chr22 34040609 34040758
+chr22 34071428 34071598
+chr22 34089724 34089746
+chr22 34092699 34093727
+chr22 34106568 34106645
+chr22 34109431 34109774
+chr22 34113104 34113314
+chr22 34149018 34149061
+chr22 34171331 34171543
+chr22 34200598 34200856
+chr22 34224859 34224940
+chr22 34251235 34255889
+chr22 34283404 34283456
+chr22 34293881 34294060
+chr22 34356468 34356706
+chr22 34397943 34398715
+chr22 34416990 34417052
+chr22 34417179 34418004
+chr22 34483080 34483156
+chr22 34490913 34490957
+chr22 34513897 34514305
+chr22 34700944 34701017
+chr22 34715060 34715206
+chr22 34737784 34738354
+chr22 34754186 34754271
+chr22 34762187 34762362
+chr22 34796484 34796554
+chr22 34799179 34799292
+chr22 34800886 34801054
+chr22 34849442 34851769
+chr22 34865917 34866124
+chr22 34872347 34872604
+chr22 34873289 34874068
+chr22 34962040 34962207
+chr22 34970312 34971067
+chr22 35019623 35019868
+chr22 35034142 35034284
+chr22 35035736 35035796
+chr22 35078231 35078260
+chr22 35079420 35079560
+chr22 35098779 35099164
+chr22 35100348 35100530
+chr22 35110163 35110247
+chr22 35128852 35128947
+chr22 35157294 35157407
+chr22 35228643 35228965
+chr22 35385318 35385425
+chr22 35390845 35390941
+chr22 35418686 35418724
+chr22 35463311 35463840
+chr22 35469270 35469367
+chr22 35586139 35586289
+chr22 35604857 35604880
+chr22 35643333 35643371
+chr22 35696052 35696400
+chr22 35706868 35707522
+chr22 35763279 35763575
+chr22 35801419 35801472
+chr22 35812489 35812511
+chr22 35820339 35820714
+chr22 35843823 35843909
+chr22 35852384 35852502
+chr22 35896276 35896315
+chr22 35917108 35917220
+chr22 35945449 35945736
+chr22 35976044 35976183
+chr22 35991902 35992144
+chr22 36008534 36008837
+chr22 36048286 36048595
+chr22 36110223 36110344
+chr22 36112554 36112708
+chr22 36113494 36113788
+chr22 36145905 36146038
+chr22 36150589 36150889
+chr22 36198632 36198797
+chr22 36253250 36253375
+chr22 36396512 36397064
+chr22 36397507 36397548
+chr22 36431929 36431962
+chr22 36456993 36457082
+chr22 36480393 36480532
+chr22 36492510 36492817
+chr22 36502202 36502266
+chr22 36518090 36518134
+chr22 36551201 36551496
+chr22 36608499 36608615
+chr22 36623767 36623835
+chr22 36643461 36643524
+chr22 36694275 36694348
+chr22 36705672 36706035
+chr22 36722368 36722427
+chr22 36762134 36762206
+chr22 36807171 36807334
+chr22 36811558 36811592
+chr22 36856787 36856858
+chr22 36863285 36863373
+chr22 36927900 36927998
+chr22 36941958 36942148
+chr22 36943615 36943798
+chr22 36963451 36963727
+chr22 36978440 36978601
+chr22 37002607 37002668
+chr22 37006725 37007239
+chr22 37019520 37019560
+chr22 37034529 37034837
+chr22 37043508 37043722
+chr22 37048854 37048891
+chr22 37069526 37069693
+chr22 37092193 37092795
+chr22 37096165 37096380
+chr22 37098671 37098726
+chr22 37119762 37119972
+chr22 37146953 37147109
+chr22 37150439 37152596
+chr22 37183495 37183640
+chr22 37206328 37206456
+chr22 37224628 37224690
+chr22 37260022 37260446
+chr22 37263116 37263789
+chr22 37324339 37324560
+chr22 37358954 37359089
+chr22 37449157 37449241
+chr22 37470125 37470335
+chr22 37471548 37471855
+chr22 37514997 37515169
+chr22 37515606 37515816
+chr22 37520821 37520985
+chr22 37613294 37613824
+chr22 37644484 37644584
+chr22 37667094 37667530
+chr22 37668168 37668768
+chr22 37679155 37679444
+chr22 37685404 37685574
+chr22 37712322 37713069
+chr22 37768824 37768970
+chr22 37843141 37843452
+chr22 37848085 37848134
+chr22 37849787 37849947
+chr22 37870717 37871147
+chr22 37884625 37884862
+chr22 37958187 37958493
+chr22 37964558 37964872
+chr22 38003577 38003874
+chr22 38005711 38005935
+chr22 38005940 38006227
+chr22 38031186 38031302
+chr22 38038729 38038796
+chr22 38075742 38075916
+chr22 38087789 38087834
+chr22 38166777 38167269
+chr22 38176686 38177127
+chr22 38214830 38214866
+chr22 38217360 38217670
+chr22 38273599 38275811
+chr22 38290858 38290955
+chr22 38317282 38317308
+chr22 38317313 38317375
+chr22 38318745 38319240
+chr22 38333185 38333309
+chr22 38349678 38350251
+chr22 38370294 38370755
+chr22 38413410 38413512
+chr22 38421979 38422397
+chr22 38516613 38516816
+chr22 38571789 38572071
+chr22 38588641 38588957
+chr22 38679410 38679671
+chr22 38692497 38692809
+chr22 38693841 38693940
+chr22 38724670 38724810
+chr22 38743304 38743608
+chr22 38775944 38776112
+chr22 38780473 38780587
+chr22 38923659 38923880
+chr22 38930358 38932613
+chr22 38945987 38946719
+chr22 38950488 38950527
+chr22 38979832 38980144
+chr22 39009807 39010435
+chr22 39127425 39127712
+chr22 39140503 39140670
+chr22 39152559 39152608
+chr22 39162210 39163191
+chr22 39238454 39238693
+chr22 39276360 39276521
+chr22 39302730 39302919
+chr22 39329629 39329715
+chr22 39382157 39382233
+chr22 39387031 39387209
+chr22 39503818 39504097
+chr22 39535649 39535696
+chr22 39538318 39538438
+chr22 39566335 39566754
+chr22 39601819 39602374
+chr22 39607321 39607494
+chr22 39682840 39683107
+chr22 39694510 39694787
+chr22 39765176 39765209
+chr22 39777392 39777413
+chr22 39818880 39819878
+chr22 39910032 39910389
+chr22 39920937 39920990
+chr22 40049234 40049323
+chr22 40154558 40154910
+chr22 40241000 40241057
+chr22 40250933 40250951
+chr22 40300926 40300948
+chr22 40342301 40342474
+chr22 40381672 40381962
+chr22 40390640 40390905
+chr22 40456252 40456339
+chr22 40466924 40467003
+chr22 40489697 40489988
+chr22 40548874 40549054
+chr22 40555640 40555938
+chr22 40564089 40564316
+chr22 40565264 40565566
+chr22 40583530 40583868
+chr22 40609014 40609308
+chr22 40675843 40676260
+chr22 40724620 40724816
+chr22 40746643 40750204
+chr22 40752400 40752713
+chr22 40763986 40764269
+chr22 40785447 40785718
+chr22 40804827 40804962
+chr22 40866690 40866724
+chr22 40907139 40907439
+chr22 40973568 40973841
+chr22 41020046 41020169
+chr22 41035441 41037057
+chr22 41043402 41043718
+chr22 41070393 41070590
+chr22 41078270 41078559
+chr22 41080666 41081164
+chr22 41112499 41112858
+chr22 41155155 41156001
+chr22 41162074 41162167
+chr22 41170653 41170962
+chr22 41194046 41194461
+chr22 41194469 41194770
+chr22 41228975 41229261
+chr22 41248977 41249073
+chr22 41263358 41263679
+chr22 41292156 41292449
+chr22 41322188 41322485
+chr22 41373767 41374984
+chr22 41381288 41381371
+chr22 41391248 41391615
+chr22 41393474 41393794
+chr22 41476126 41476739
+chr22 41501305 41501436
+chr22 41537165 41537362
+chr22 41580126 41580333
+chr22 41608948 41609235
+chr22 41609805 41610395
+chr22 41627095 41627236
+chr22 41657490 41657816
+chr22 41665380 41665575
+chr22 41681733 41681888
+chr22 41748401 41748538
+chr22 41750259 41750679
+chr22 41755786 41755863
+chr22 41845929 41846042
+chr22 41847508 41847567
+chr22 41899999 41900110
+chr22 41921609 41921908
+chr22 41929935 41930236
+chr22 41941154 41941280
+chr22 41975946 41976360
+chr22 42002215 42002718
+chr22 42005231 42006764
+chr22 42027502 42027574
+chr22 42050660 42051669
+chr22 42120358 42120616
+chr22 42179381 42179533
+chr22 42182635 42183248
+chr22 42195500 42195798
+chr22 42235913 42236205
+chr22 42266108 42266177
+chr22 42296169 42296206
+chr22 42319656 42319704
+chr22 42377967 42378281
+chr22 42382378 42382448
+chr22 42389989 42390105
+chr22 42438732 42439040
+chr22 42485857 42486092
+chr22 42546327 42546490
+chr22 42567382 42567700
+chr22 42569255 42569559
+chr22 42569839 42569874
+chr22 42639345 42639795
+chr22 42645208 42645292
+chr22 42672345 42672869
+chr22 42773961 42774256
+chr22 42781759 42782167
+chr22 42865491 42865654
+chr22 42917791 42917938
+chr22 42927250 42927548
+chr22 42936628 42936917
+chr22 42947698 42947854
+chr22 43060014 43060102
+chr22 43080498 43080794
+chr22 43104863 43105143
+chr22 43112189 43112497
+chr22 43174718 43174906
+chr22 43178323 43178620
+chr22 43231508 43231617
+chr22 43252764 43252858
+chr22 43272796 43272905
+chr22 43308785 43308971
+chr22 43332942 43333109
+chr22 43334753 43334973
+chr22 43344497 43344548
+chr22 43371049 43371149
+chr22 43386207 43386553
+chr22 43392009 43392309
+chr22 43402765 43402891
+chr22 43448222 43448427
+chr22 43454310 43454572
+chr22 43470701 43470800
+chr22 43529412 43529895
+chr22 43641333 43641632
+chr22 43658237 43658514
+chr22 43699097 43699571
+chr22 43702024 43702269
+chr22 43760548 43760644
+chr22 43795819 43796079
+chr22 43797976 43798431
+chr22 43813300 43813583
+chr22 43838443 43838810
+chr22 43881946 43882078
+chr22 43896708 43896825
+chr22 43908194 43908501
+chr22 43949325 43949651
+chr22 43991613 43992002
+chr22 44023228 44023606
+chr22 44066861 44067141
+chr22 44176095 44176625
+chr22 44178440 44178987
+chr22 44179208 44179744
+chr22 44248594 44248729
+chr22 44277244 44277448
+chr22 44303164 44303203
+chr22 44325828 44325975
+chr22 44384484 44384517
+chr22 44491717 44492163
+chr22 44533055 44533084
+chr22 44539477 44540306
+chr22 44571644 44571769
+chr22 44601639 44602423
+chr22 44611877 44611983
+chr22 44636593 44637393
+chr22 44686294 44687686
+chr22 44889303 44889418
+chr22 44952929 44952970
+chr22 44976506 44976622
+chr22 45054881 45055191
+chr22 45108668 45108956
+chr22 45119120 45119165
+chr22 45131211 45131512
+chr22 45152262 45152314
+chr22 45154888 45155129
+chr22 45177846 45178118
+chr22 45213242 45213303
+chr22 45248669 45248761
+chr22 45256746 45256883
+chr22 45294882 45295167
+chr22 45303786 45305778
+chr22 45339430 45339544
+chr22 45361330 45361638
+chr22 45395575 45395860
+chr22 45463640 45465121
+chr22 45480312 45480364
+chr22 45500780 45500917
+chr22 45649087 45649263
+chr22 45703024 45703178
+chr22 45733453 45733770
+chr22 45809446 45809586
+chr22 45821637 45821694
+chr22 45841391 45841467
+chr22 45929001 45929084
+chr22 45940599 45941224
+chr22 45943660 45943756
+chr22 46026200 46026294
+chr22 46215455 46215772
+chr22 46231040 46231520
+chr22 46278901 46279017
+chr22 46283387 46283474
+chr22 46427025 46427250
+chr22 46466255 46466411
+chr22 46466758 46467110
+chr22 46492981 46493261
+chr22 46557637 46557751
+chr22 46558680 46560161
+chr22 46598355 46598406
+chr22 46697290 46697499
+chr22 46898305 46898918
+chr22 46937132 46939802
+chr22 46939805 46940331
+chr22 46941934 46941967
+chr22 46952417 46952523
+chr22 47022222 47024461
+chr22 47079492 47080820
+chr22 47216896 47217189
+chr22 47223222 47223405
+chr22 47224226 47224925
+chr22 47229631 47229853
+chr22 47252417 47252539
+chr22 47269405 47269951
+chr22 47286845 47286920
+chr22 47291620 47291663
+chr22 47317523 47318451
+chr22 47321220 47322381
+chr22 47344533 47344589
+chr22 47402830 47403129
+chr22 47412759 47414595
+chr22 47452535 47452592
+chr22 47525652 47525813
+chr22 47527764 47528163
+chr22 47607182 47607219
+chr22 47675646 47675735
+chr22 47730617 47731226
+chr22 47911324 47911772
+chr22 48025579 48025806
+chr22 48038159 48038282
+chr22 48041169 48041969
+chr22 48047912 48047952
+chr22 48053259 48053598
+chr22 48090884 48091363
+chr22 48120964 48121391
+chr22 48139447 48139638
+chr22 48287924 48288237
+chr22 48328541 48328673
+chr22 48347640 48347780
+chr22 48468978 48469282
+chr22 48538074 48538235
+chr22 48549675 48549711
+chr22 48599737 48599843
+chr22 48654227 48654246
+chr22 48686596 48687483
+chr22 48743750 48743850
+chr22 48928044 48928188
+chr22 48932485 48936541
+chr22 48947287 48947606
+chr22 49028815 49028862
+chr22 49041709 49042030
+chr22 49082496 49082798
+chr22 49266667 49266881
+chr22 49359996 49360197
+chr22 49394192 49394228
+chr22 49440579 49441166
+chr22 49452131 49452334
+chr22 49480362 49480405
+chr22 49505723 49505924
+chr22 49511339 49511715
+chr22 49559138 49559246
+chr22 49569827 49569877
+chr22 49572088 49572126
+chr22 49657329 49657700
+chr22 49759131 49759277
+chr22 49857597 49858201
+chr22 49881398 49881666
+chr22 49915669 49915774
+chr22 49981006 49982892
+chr22 49987648 49988083
+chr22 50074671 50075098
+chr22 50092348 50092651
+chr22 50101513 50101811
+chr22 50214147 50214325
+chr22 50273212 50274222
+chr22 50357359 50357474
+chr22 50386127 50386487
+chr22 50395487 50396695
+chr22 50403486 50403659
+chr22 50501361 50501661
+chr22 50624075 50624449
+chr22 50641027 50641222
+chr22 50675373 50675908
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed
new file mode 100644
index 00000000000..34dae449f4e
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/hg38.SegDup.chr22.bed
@@ -0,0 +1,229 @@
+chr22 10510000 10716866
+chr22 10717194 10783858
+chr22 10834643 10874572
+chr22 10924572 10966724
+chr22 11016724 11068956
+chr22 11118987 11160921
+chr22 11236613 11375636
+chr22 11428056 11497337
+chr22 11550626 11631288
+chr22 11681288 11724629
+chr22 11774629 11977555
+chr22 12028037 12126656
+chr22 12126660 12225588
+chr22 12275588 12375376
+chr22 12489121 12641730
+chr22 12691730 12693537
+chr22 12694103 12726204
+chr22 12776204 12818137
+chr22 12868137 12904788
+chr22 15200000 15934152
+chr22 16027648 16029124
+chr22 16049831 16268393
+chr22 16302843 16304296
+chr22 16305427 16307046
+chr22 16378028 16568459
+chr22 16568499 16582793
+chr22 16583474 16608423
+chr22 16608436 16610320
+chr22 16610337 16680489
+chr22 16680506 16766359
+chr22 16838852 16843928
+chr22 16850819 16912063
+chr22 16922834 16928158
+chr22 17014088 17038567
+chr22 17214090 17215248
+chr22 17419415 17420675
+chr22 17763709 17765328
+chr22 17984579 17998239
+chr22 18008719 18010388
+chr22 18159723 18239129
+chr22 18339129 18433513
+chr22 18483513 18518714
+chr22 18518836 18659561
+chr22 18709564 18939750
+chr22 18950240 18976730
+chr22 18977700 18981974
+chr22 18999690 19010480
+chr22 19010632 19015209
+chr22 19023255 19035499
+chr22 20141036 20166817
+chr22 20262035 20290267
+chr22 20299686 20301294
+chr22 20301634 20323554
+chr22 20324592 20362643
+chr22 20362648 20377695
+chr22 20388431 20390146
+chr22 20451509 20453240
+chr22 20482745 20484148
+chr22 20667580 20671339
+chr22 20672650 20674606
+chr22 20678347 20699044
+chr22 20700323 20720038
+chr22 20731437 20738049
+chr22 21010365 21018352
+chr22 21042050 21050489
+chr22 21057379 21062735
+chr22 21111384 21151083
+chr22 21151091 21443089
+chr22 21458187 21562827
+chr22 21954496 21958494
+chr22 22114615 22118379
+chr22 22118384 22120610
+chr22 22220568 22224566
+chr22 22249758 22315122
+chr22 22320797 22327981
+chr22 22402552 22410447
+chr22 22620608 22655111
+chr22 22664938 22668785
+chr22 22670912 22699474
+chr22 22699496 22703424
+chr22 22704161 22717672
+chr22 22732715 22761861
+chr22 22761911 22776923
+chr22 22789775 22794414
+chr22 22820054 22823467
+chr22 22897363 22901536
+chr22 22902711 22906902
+chr22 23263953 23264988
+chr22 23306925 23321261
+chr22 23322143 23351708
+chr22 23351727 23357279
+chr22 23358158 23378265
+chr22 23378551 23380944
+chr22 23381237 23402335
+chr22 23402754 23405250
+chr22 23415788 23427393
+chr22 23432025 23461151
+chr22 23464357 23480727
+chr22 23480999 23489014
+chr22 23493585 23499842
+chr22 23499856 23510388
+chr22 23511082 23552138
+chr22 23553526 23577255
+chr22 23594547 23602870
+chr22 23616462 23620198
+chr22 23620202 23631127
+chr22 23631491 23644932
+chr22 23658595 23661000
+chr22 23670225 23679116
+chr22 23734856 23737374
+chr22 23853822 23856326
+chr22 23929519 23932056
+chr22 23939932 23969548
+chr22 23971773 24001392
+chr22 24179310 24184938
+chr22 24236854 24268803
+chr22 24268948 24283958
+chr22 24286747 24301118
+chr22 24310536 24314778
+chr22 24377585 24389259
+chr22 24598467 24684046
+chr22 24686848 24690592
+chr22 24690598 24693136
+chr22 24701278 24703178
+chr22 24764774 24766052
+chr22 25177661 25179556
+chr22 25226889 25247092
+chr22 25253643 25258280
+chr22 25258590 25284823
+chr22 25284953 25313517
+chr22 25314206 25328057
+chr22 25330858 25340072
+chr22 25344989 25348911
+chr22 25350903 25382596
+chr22 25385378 25414514
+chr22 25415392 25426013
+chr22 25426105 25427123
+chr22 25428803 25435365
+chr22 25454748 25471797
+chr22 25502468 25507013
+chr22 25515602 25532252
+chr22 25559573 25569251
+chr22 25575611 25576630
+chr22 25580799 25617150
+chr22 25649024 25652032
+chr22 26560427 26561624
+chr22 28675662 28696392
+chr22 29355386 29357923
+chr22 29360314 29368998
+chr22 29399343 29409561
+chr22 29410818 29411866
+chr22 29411971 29424893
+chr22 29434688 29443954
+chr22 29488845 29491290
+chr22 29608860 29609951
+chr22 29880407 29902476
+chr22 30543830 30544914
+chr22 30655294 30657152
+chr22 30902152 30904579
+chr22 31057439 31062134
+chr22 31065390 31078993
+chr22 31204482 31207013
+chr22 32111378 32113026
+chr22 32130965 32141884
+chr22 32148444 32149492
+chr22 32150491 32158285
+chr22 32159300 32162703
+chr22 32170391 32172935
+chr22 32175877 32184749
+chr22 32187175 32197389
+chr22 32271774 32273163
+chr22 32273176 32279646
+chr22 32279657 32329510
+chr22 32331229 32341624
+chr22 32341631 32344607
+chr22 32344611 32346315
+chr22 32357195 32363983
+chr22 36127120 36129313
+chr22 36534514 36537489
+chr22 38355621 38401450
+chr22 38957757 38964074
+chr22 38982097 38988028
+chr22 38988116 38993669
+chr22 39020021 39066093
+chr22 39069234 39080941
+chr22 39520561 39522683
+chr22 39875503 39876516
+chr22 40824594 40827232
+chr22 41073572 41075248
+chr22 41433493 41434823
+chr22 41708403 41709587
+chr22 41901063 41902969
+chr22 42000444 42002033
+chr22 42045815 42047270
+chr22 42123191 42132193
+chr22 42135343 42145873
+chr22 42149884 42155241
+chr22 42446192 42450718
+chr22 42479610 42489336
+chr22 42500173 42520238
+chr22 42523089 42528444
+chr22 42529035 42536680
+chr22 42553733 42559454
+chr22 42566420 42582450
+chr22 42775905 42777359
+chr22 43032309 43040224
+chr22 43166623 43168160
+chr22 43644840 43645853
+chr22 44112740 44114031
+chr22 44114040 44115404
+chr22 44222726 44266377
+chr22 44266598 44277214
+chr22 44566935 44568316
+chr22 44858020 44860015
+chr22 44865612 44867631
+chr22 45184450 45188011
+chr22 46139130 46141577
+chr22 46479626 46481736
+chr22 46615032 46616157
+chr22 46616221 46617437
+chr22 48618772 48620136
+chr22 48620180 48621405
+chr22 48901510 48903279
+chr22 48910475 48911620
+chr22 48911634 48912886
+chr22 49383944 49385910
+chr22 49386637 49388496
+chr22 50432257 50442552
+chr22 50740515 50808468
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv
new file mode 100644
index 00000000000..6ca199c9600
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config.tsv
@@ -0,0 +1,7 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
+INS_small_SD INS -1 -1 SD
+DEL_50_5k_both DEL 50 5000 SD,RM
+DEL_5k_50k_SD DEL 5000 50000 SD
+DUP_lt5kb_RM DUP 5000 RM
+INV_gt1kb INV 1000 -1 NULL
+BND_SD BND -1 -1 SD
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv
new file mode 100644
index 00000000000..417167668b7
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_duplicate.tsv
@@ -0,0 +1,8 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
+INS_small_SD INS -1 -1 SD
+DEL_50_5k_dup DEL 50 5000 SD
+DEL_50_5k_dup DEL 50 5000 RM
+DEL_5k_50k_SD DEL 5000 50000 SD
+DUP_lt5kb_RM DUP 5000 RM
+INV_gt1kb INV 1000 -1 NULL
+BND_SD BND -1 -1 SD
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv
new file mode 100644
index 00000000000..990df0d9458
--- /dev/null
+++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVStratify/test_config_redundant.tsv
@@ -0,0 +1,8 @@
+NAME SVTYPE MIN_SIZE MAX_SIZE TRACKS
+INS_small_SD INS -1 -1 SD
+DEL_50_5k_SD DEL 50 5000 SD
+DEL_50_5k_RM DEL 50 5000 RM
+DEL_5k_50k_SD DEL 5000 50000 SD
+DUP_lt5kb_RM DUP 5000 RM
+INV_gt1kb INV 1000 -1 NULL
+BND_SD BND -1 -1 SD