diff --git a/settings.gradle b/settings.gradle
index fbce9bd..62fdb04 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -1,3 +1,3 @@
 rootProject.name = 'bioio'
 
-include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'pdb', 'chain', 'vcf', 'genbank', 'turtle', 'bgee', 'text'
+include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'chain', 'vcf', 'vff', 'genbank', 'turtle', 'bgee', 'text'
diff --git a/vff/build.gradle b/vff/build.gradle
new file mode 100644
index 0000000..75740b8
--- /dev/null
+++ b/vff/build.gradle
@@ -0,0 +1,3 @@
+dependencies {
+    api project(':core')
+}
diff --git a/vff/src/main/java/org/pharmgkb/parsers/vff/VffParser.java b/vff/src/main/java/org/pharmgkb/parsers/vff/VffParser.java
new file mode 100644
index 0000000..288236d
--- /dev/null
+++ b/vff/src/main/java/org/pharmgkb/parsers/vff/VffParser.java
@@ -0,0 +1,95 @@
+package org.pharmgkb.parsers.vff;
+
+import org.pharmgkb.parsers.LineParser;
+import org.pharmgkb.parsers.vff.model.VffEntry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import javax.annotation.concurrent.ThreadSafe;
+import java.lang.invoke.MethodHandles;
+import java.util.StringJoiner;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.regex.Pattern;
+
+// NOTE(review): BadDataFormatException, StandardChromosomeName and Strand are used below but are
+// not imported in this file; add imports from the packages that declare them.
+
+/**
+ * Parses tab-delimited VFF lines into {@link VffEntry} records.
+ *
+ * Each line must have at least five columns: chromosome, start, end, the alleles written as
+ * {@code ref/variant}, and a strand symbol. The start and end read from the line are decremented
+ * by one before they are stored in the entry.
+ */
+@ThreadSafe
+public class VffParser implements VffParserI {
+
+    private static final long sf_logEvery = 10000;
+    private static final Pattern sf_slash = Pattern.compile("/");
+    private static final Pattern sf_tab = Pattern.compile("\t");
+
+    private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final AtomicLong m_lineNumber = new AtomicLong(0L);
+
+    /**
+     * Parses one line and counts it as processed.
+     *
+     * @param line A single line of VFF text
+     * @return The entry described by {@code line}
+     * @throws BadDataFormatException If a column is missing or a value cannot be parsed
+     */
+    @Nonnull
+    @Override
+    public VffEntry apply(@Nonnull String line) throws BadDataFormatException {
+
+        // Read the counter exactly once so that the log and error messages refer to this line
+        // even when other threads are parsing at the same time.
+        long lineNumber = m_lineNumber.incrementAndGet();
+        if (lineNumber % sf_logEvery == 0) {
+            sf_logger.debug("Reading line #{}", lineNumber);
+        }
+
+        try {
+            // Split inside the try block so that a line with too few columns is reported as
+            // bad data instead of escaping as a bare ArrayIndexOutOfBoundsException.
+            String[] parts = sf_tab.split(line);
+            String[] alleles = sf_slash.split(parts[3]);
+            return new VffEntry(
+                    new StandardChromosomeName(parts[0]),
+                    Integer.parseInt(parts[1]) - 1,
+                    Integer.parseInt(parts[2]) - 1,
+                    alleles[0],
+                    alleles[1],
+                    Strand.fromSymbol(parts[4])
+            );
+
+        } catch (IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
+            throw new BadDataFormatException("Bad data format on line #" + lineNumber
+                    + "; line is [[[" + line + "]]]", e);
+        } catch (RuntimeException e) {
+            // this is a little weird, but it's helpful
+            // note that we're not throwing a BadDataFormatException because this failure is unexpected
+            e.addSuppressed(new RuntimeException("Unexpectedly failed to parse line " + lineNumber));
+            throw e;
+        }
+    }
+
+    /**
+     * @return The number of lines passed to {@link #apply(String)} so far
+     */
+    @Nonnegative
+    @Override
+    public long nLinesProcessed() {
+        return m_lineNumber.get();
+    }
+
+    @Override
+    public String toString() {
+        return new StringJoiner(", ", VffParser.class.getSimpleName() + "[", "]")
+                .add("lineNumber=" + m_lineNumber)
+                .toString();
+    }
+}
diff --git a/vff/src/main/java/org/pharmgkb/parsers/vff/VffParserI.java b/vff/src/main/java/org/pharmgkb/parsers/vff/VffParserI.java
new file mode 100644
index 0000000..0cc15cb
--- /dev/null
+++ b/vff/src/main/java/org/pharmgkb/parsers/vff/VffParserI.java
@@ -0,0 +1,12 @@
+package org.pharmgkb.parsers.vff;
+
+import org.pharmgkb.parsers.LineParser;
+import org.pharmgkb.parsers.vff.model.VffEntry;
+
+// NOTE(review): the type argument assumes LineParser is generic in its result type; confirm.
+/**
+ * A line parser whose results are {@link VffEntry} records.
+ */
+public interface VffParserI extends LineParser<VffEntry> {
+
+}
diff --git a/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriter.java b/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriter.java
new file mode 100644
index 0000000..641f09a
--- /dev/null
+++ b/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriter.java
@@ -0,0 +1,65 @@
+package org.pharmgkb.parsers.vff;
+
+import org.pharmgkb.parsers.LineWriter;
+import org.pharmgkb.parsers.vff.model.VffEntry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import javax.annotation.concurrent.ThreadSafe;
+import java.lang.invoke.MethodHandles;
+import java.util.concurrent.atomic.AtomicLong;
+
+// NOTE(review): the type argument assumes LineWriter is generic in its input type; confirm.
+/**
+ * Formats {@link VffEntry} records as tab-delimited VFF lines.
+ *
+ * The start and end stored in the entry are incremented by one when written, and the two
+ * alleles are joined with a slash.
+ */
+@ThreadSafe
+public class VffWriter implements LineWriter<VffEntry>, VffWriterI {
+
+    private static final long sf_logEvery = 10000;
+    private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final AtomicLong m_lineNumber = new AtomicLong(0L);
+
+    /**
+     * Formats one entry as a line and counts it as processed.
+     *
+     * @param entry The record to write
+     * @return The formatted line; no line terminator is appended
+     */
+    @Nonnull
+    @Override
+    public String apply(@Nonnull VffEntry entry) {
+        // Count every entry so that nLinesProcessed() reflects the work actually done.
+        long lineNumber = m_lineNumber.incrementAndGet();
+        if (lineNumber % sf_logEvery == 0) {
+            sf_logger.debug("Writing line #{}", lineNumber);
+        }
+        return entry.chromosome().original() + "\t" +
+                (entry.start() + 1) + "\t" +
+                (entry.end() + 1) + "\t" +
+                entry.refAllele() + "/" + entry.variantAllele() + "\t" +
+                entry.strand().symbol;
+    }
+
+    /**
+     * @return The number of entries passed to {@link #apply(VffEntry)} so far
+     */
+    @Nonnegative
+    @Override
+    public long nLinesProcessed() {
+        return m_lineNumber.get();
+    }
+
+    @Override
+    public String toString() {
+        return "VffWriter{" +
+                "lineNumber=" + m_lineNumber.get() +
+                '}';
+    }
+}
diff --git a/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriterI.java b/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriterI.java
new file mode 100644
index 0000000..73ed725
--- /dev/null
+++ b/vff/src/main/java/org/pharmgkb/parsers/vff/VffWriterI.java
@@ -0,0 +1,8 @@
+package org.pharmgkb.parsers.vff;
+
+/**
+ * Marker interface for VFF writers; it declares no members.
+ */
+public interface VffWriterI {
+
+}
diff --git a/vff/src/main/java/org/pharmgkb/parsers/vff/model/VffEntry.java b/vff/src/main/java/org/pharmgkb/parsers/vff/model/VffEntry.java
new file mode 100644
index 0000000..08de1ee
--- /dev/null
+++ b/vff/src/main/java/org/pharmgkb/parsers/vff/model/VffEntry.java
@@ -0,0 +1,73 @@
+package org.pharmgkb.parsers.vff.model;
+
+import org.pharmgkb.parsers.core.model.NucleotideCode;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import java.util.List;
+
+// NOTE(review): ChromosomeName and Strand are used below but are not imported in this file;
+// add imports from the packages that declare them.
+
+/**
+ * A single VFF record.
+ *
+ * @param chromosome The chromosome name
+ * @param start The start coordinate
+ * @param end The end coordinate
+ * @param refAllele The reference allele, one character per base
+ * @param variantAllele The variant allele, one character per base
+ * @param strand The strand
+ */
+public record VffEntry(
+    @Nonnull ChromosomeName chromosome,
+    @Nonnegative int start,
+    @Nonnegative int end,
+    @Nonnull String refAllele,
+    @Nonnull String variantAllele,
+    @Nonnull Strand strand
+) {
+
+    /**
+     * @return The reference allele converted character by character to {@link NucleotideCode}
+     */
+    @Nonnull
+    public List<NucleotideCode> refBases() {
+        return toBases(refAllele);
+    }
+
+    /**
+     * @return The variant allele converted character by character to {@link NucleotideCode}
+     */
+    @Nonnull
+    public List<NucleotideCode> variantBases() {
+        return toBases(variantAllele);
+    }
+
+    /**
+     * @return The variant bases when {@code start == end + 1}, otherwise an empty list
+     */
+    @Nonnull
+    public List<NucleotideCode> insertedBases() {
+        return start == end + 1 ? variantBases() : List.of();
+    }
+
+    /**
+     * @return The reference bases when {@code start == end - 2}, otherwise an empty list
+     */
+    @Nonnull
+    public List<NucleotideCode> deleted() {
+        // NOTE(review): this matches only spans where end - start is exactly 2; confirm whether
+        // deletions of other lengths were meant to be recognized too.
+        return start == end - 2 ? refBases() : List.of();
+    }
+
+    /** Converts each character of {@code allele} to a code via NucleotideCode.valueOf. */
+    private static List<NucleotideCode> toBases(String allele) {
+        return allele.chars()
+                // chars() yields ints; cast back to char so "A" is looked up rather than "65"
+                .mapToObj(c -> String.valueOf((char) c))
+                .map(NucleotideCode::valueOf)
+                .toList();
+    }
+}