Skip to content

Commit

Permalink
add VFF
Browse files Browse the repository at this point in the history
  • Loading branch information
dmyersturnbull committed Jan 22, 2024
1 parent b87d667 commit ebf8d28
Show file tree
Hide file tree
Showing 7 changed files with 165 additions and 1 deletion.
2 changes: 1 addition & 1 deletion settings.gradle
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
rootProject.name = 'bioio'

include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'pdb', 'chain', 'vcf', 'genbank', 'turtle', 'bgee', 'text'
include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'chain', 'vcf', 'vff', 'genbank', 'turtle', 'bgee', 'text'
3 changes: 3 additions & 0 deletions vff/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Dependencies of the 'vff' subproject.
dependencies {
// The sibling ':core' project is declared on the 'api' configuration.
api project(':core')
}
68 changes: 68 additions & 0 deletions vff/src/main/java/org/pharmgkb/parsers/vff/VffParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package org.pharmgkb.parsers.vff;

import org.pharmgkb.parsers.LineParser;
import org.pharmgkb.parsers.vff.model.VffEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;
import javax.annotation.concurrent.ThreadSafe;
import java.lang.invoke.MethodHandles;
import java.util.StringJoiner;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

@ThreadSafe
public class VffParser implements VffParserI {

private static final long sf_logEvery = 10000;
private static final Pattern sf_slash = Pattern.compile("/");
private static final Pattern sf_tab = Pattern.compile("\t");

private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

private final AtomicLong m_lineNumber = new AtomicLong(0L);

@Nonnull
@Override
public VffEntry apply(@Nonnull String line) throws BadDataFormatException {

if (m_lineNumber.incrementAndGet() % sf_logEvery == 0) {
sf_logger.debug("Reading line #{}", m_lineNumber);
}

String[] parts = sf_tab.split(line);
String[] alleles = sf_slash.split(parts[3]);
try {
return new VffEntry(
new StandardChromosomeName(parts[0]),
Integer.parseInt(parts[1]) - 1,
Integer.parseInt(parts[2]) - 1,
alleles[0],
alleles[1],
Strand.fromSymbol(parts[4])
)

} catch (IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
throw new BadDataFormatException("Bad data format on line #" + m_lineNumber.get()
+ "; line is [[[" + line + "]]]", e);
} catch (RuntimeException e) {
// this is a little weird, but it's helpful
// not that we're not throwing a BadDataFormatException because we don't expect AIOOB, e.g.
e.addSuppressed(new RuntimeException("Unexpectedly failed to parse line " + m_lineNumber));
throw e;
}
}

@Nonnegative
@Override
public long nLinesProcessed() {
return m_lineNumber.get();
}

@Override
public String toString() {
return new StringJoiner(", ", VffParser.class.getSimpleName() + "[", "]")
.add("lineNumber=" + m_lineNumber)
.toString();
}
}
8 changes: 8 additions & 0 deletions vff/src/main/java/org/pharmgkb/parsers/vff/VffParserI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package org.pharmgkb.parsers.vff;

import org.pharmgkb.parsers.LineParser;
import org.pharmgkb.parsers.vff.model.VffEntry;

/**
 * Parser contract for this package: a {@link LineParser} that produces {@link VffEntry} objects.
 * Declares no members of its own.
 */
public interface VffParserI extends LineParser<VffEntry> {

}
44 changes: 44 additions & 0 deletions vff/src/main/java/org/pharmgkb/parsers/vff/VffWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.pharmgkb.parsers.vff;

import org.pharmgkb.parsers.LineWriter;
import org.pharmgkb.parsers.vff.model.VffEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;
import javax.annotation.concurrent.ThreadSafe;
import java.lang.invoke.MethodHandles;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Formats a {@link VffEntry} as one tab-delimited line.
 *
 * <p>The columns written are, in order: the chromosome's original name, start + 1, end + 1, the
 * {@code ref/variant} allele pair joined by a slash, and the strand symbol. No line terminator is
 * appended.
 *
 * <p>The writer keeps a running count of the entries passed to {@link #apply(VffEntry)}; the
 * count is held in an {@link AtomicLong}.
 */
@ThreadSafe
public class VffWriter implements LineWriter<VffEntry>, VffWriterI {

  private static final long sf_logEvery = 10000;
  private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private final AtomicLong m_lineNumber = new AtomicLong(0L);

  /**
   * Formats a single entry.
   *
   * @param entry the entry to format
   * @return the formatted line, without a trailing newline
   */
  @Nonnull
  @Override
  public String apply(@Nonnull VffEntry entry) {
    // Count every entry so that nLinesProcessed() reflects real progress; previously the
    // counter was never advanced and the method always reported 0.
    final long lineNumber = m_lineNumber.incrementAndGet();
    if (lineNumber % sf_logEvery == 0) {
      sf_logger.debug("Writing line #{}", lineNumber);
    }
    // Coordinates have 1 added back, mirroring the subtraction done by VffParser.
    return entry.chromosome().original() + "\t"
        + (entry.start() + 1) + "\t"
        + (entry.end() + 1) + "\t"
        + entry.refAllele() + "/" + entry.variantAllele() + "\t"
        + entry.strand().symbol;
  }

  /**
   * @return the number of entries passed to {@link #apply(VffEntry)} so far
   */
  @Nonnegative
  @Override
  public long nLinesProcessed() {
    return m_lineNumber.get();
  }

  @Override
  public String toString() {
    return "VffWriter{" +
        "lineNumber=" + m_lineNumber.get() +
        '}';
  }
}
5 changes: 5 additions & 0 deletions vff/src/main/java/org/pharmgkb/parsers/vff/VffWriterI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package org.pharmgkb.parsers.vff;

/**
 * Marker interface for writers in this package. Declares no members.
 *
 * <p>NOTE(review): unlike {@code VffParserI}, which extends {@code LineParser<VffEntry>}, this
 * interface extends nothing; {@code VffWriter} implements {@code LineWriter<VffEntry>} directly.
 * Confirm whether this interface was meant to extend {@code LineWriter<VffEntry>}.
 */
public interface VffWriterI {

}
36 changes: 36 additions & 0 deletions vff/src/main/java/org/pharmgkb/parsers/vff/model/VffEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package org.pharmgkb.parsers.vff.model;

import org.pharmgkb.parsers.core.model.NucleotideCode;

import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;

/**
 * One variant record: a chromosome, a start/end coordinate pair, a reference allele, a variant
 * allele, and a strand.
 *
 * <p>As populated by {@code VffParser}, {@code start} and {@code end} hold the file's coordinate
 * values minus 1, and the two alleles are the text on either side of the slash in the allele
 * column.
 *
 * @param chromosome the chromosome the variant lies on
 * @param start the start coordinate
 * @param end the end coordinate
 * @param refAllele the reference allele string
 * @param variantAllele the variant allele string
 * @param strand the strand the alleles are reported on
 */
public record VffEntry(
    @Nonnull ChromosomeName chromosome,
    @Nonnegative int start,
    @Nonnegative int end,
    @Nonnull String refAllele,
    @Nonnull String variantAllele,
    @Nonnull Strand strand
) {

  /**
   * @return the reference allele as one {@link NucleotideCode} per character
   */
  @Nonnull
  public List<NucleotideCode> refBases() {
    return toBases(refAllele);
  }

  /**
   * @return the variant allele as one {@link NucleotideCode} per character
   */
  @Nonnull
  public List<NucleotideCode> variantBases() {
    return toBases(variantAllele);
  }

  /**
   * @return {@link #variantBases()} when {@code start == end + 1}, otherwise an empty list
   */
  @Nonnull
  public List<NucleotideCode> insertedBases() {
    return start == end + 1 ? variantBases() : List.of();
  }

  /**
   * @return {@link #refBases()} when {@code start == end - 2}, otherwise an empty list
   */
  @Nonnull
  public List<NucleotideCode> deleted() {
    // NOTE(review): this condition only matches spans of exactly three positions, which looks
    // unintended for a general deletion test. Left unchanged; confirm against the format spec.
    return start == end - 2 ? refBases() : List.of();
  }

  /**
   * Converts an allele string to nucleotide codes, one per character.
   * Each character is passed to {@code NucleotideCode.valueOf} as a one-character string.
   */
  @Nonnull
  private static List<NucleotideCode> toBases(@Nonnull String allele) {
    // String.chars() yields int code units; without the (char) cast, String.valueOf(int)
    // would produce the decimal number (e.g. "65") instead of the character (e.g. "A").
    return allele.chars()
        .mapToObj(c -> String.valueOf((char) c))
        .map(NucleotideCode::valueOf)
        .toList();
  }
}

0 comments on commit ebf8d28

Please sign in to comment.