forked from PharmGKB/genome-sequence-io
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b87d667
commit ebf8d28
Showing
7 changed files
with
165 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
rootProject.name = 'bioio' | ||
|
||
include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'pdb', 'chain', 'vcf', 'genbank', 'turtle', 'bgee', 'text' | ||
include 'core', 'bed', 'gff', 'fasta', 'pedigree', 'chain', 'vcf', 'vff', 'genbank', 'turtle', 'bgee', 'text' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
dependencies { | ||
api project(':core') | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package org.pharmgkb.parsers.vff; | ||
|
||
import org.pharmgkb.parsers.LineParser; | ||
import org.pharmgkb.parsers.vff.model.VffEntry; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import javax.annotation.Nonnegative; | ||
import javax.annotation.Nonnull; | ||
import javax.annotation.concurrent.ThreadSafe; | ||
import java.lang.invoke.MethodHandles; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
|
||
@ThreadSafe | ||
public class VffParser implements VffParserI { | ||
|
||
private static final long sf_logEvery = 10000; | ||
private static final Pattern sf_slash = Pattern.compile("/"); | ||
private static final Pattern sf_tab = Pattern.compile("\t"); | ||
|
||
private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); | ||
|
||
private final AtomicLong m_lineNumber = new AtomicLong(0L); | ||
|
||
@Nonnull | ||
@Override | ||
public VffEntry apply(@Nonnull String line) throws BadDataFormatException { | ||
|
||
if (m_lineNumber.incrementAndGet() % sf_logEvery == 0) { | ||
sf_logger.debug("Reading line #{}", m_lineNumber); | ||
} | ||
|
||
String[] parts = sf_tab.split(line); | ||
String[] alleles = sf_slash.split(parts[3]); | ||
try { | ||
return new VffEntry( | ||
new StandardChromosomeName(parts[0]), | ||
Integer.parseInt(parts[1]) - 1, | ||
Integer.parseInt(parts[2]) - 1, | ||
alleles[0], | ||
alleles[1], | ||
Strand.fromSymbol(parts[4]) | ||
) | ||
|
||
} catch (IllegalArgumentException | ArrayIndexOutOfBoundsException e) { | ||
throw new BadDataFormatException("Bad data format on line #" + m_lineNumber.get() | ||
+ "; line is [[[" + line + "]]]", e); | ||
} catch (RuntimeException e) { | ||
// this is a little weird, but it's helpful | ||
// not that we're not throwing a BadDataFormatException because we don't expect AIOOB, e.g. | ||
e.addSuppressed(new RuntimeException("Unexpectedly failed to parse line " + m_lineNumber)); | ||
throw e; | ||
} | ||
} | ||
|
||
@Nonnegative | ||
@Override | ||
public long nLinesProcessed() { | ||
return m_lineNumber.get(); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return new StringJoiner(", ", VffParser.class.getSimpleName() + "[", "]") | ||
.add("lineNumber=" + m_lineNumber) | ||
.toString(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package org.pharmgkb.parsers.vff; | ||
|
||
import org.pharmgkb.parsers.LineParser; | ||
import org.pharmgkb.parsers.vff.model.VffEntry; | ||
|
||
public interface VffParserI extends LineParser<VffEntry> { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package org.pharmgkb.parsers.vff; | ||
|
||
import org.pharmgkb.parsers.LineWriter; | ||
import org.pharmgkb.parsers.vff.model.VffEntry; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import javax.annotation.Nonnegative; | ||
import javax.annotation.Nonnull; | ||
import javax.annotation.concurrent.ThreadSafe; | ||
import java.lang.invoke.MethodHandles; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
|
||
@ThreadSafe | ||
public class VffWriter implements LineWriter<VffEntry>, VffWriterI { | ||
|
||
private static final long sf_logEvery = 10000; | ||
private static final Logger sf_logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); | ||
|
||
private final AtomicLong m_lineNumber = new AtomicLong(0L); | ||
|
||
@Nonnull | ||
@Override | ||
public String apply(@Nonnull VffEntry entry) { | ||
return entry.chromosome().original() + "\t" | ||
+ (entry.start() + 1) + "\t" | ||
+ (entry.end() + 1) + "\t" | ||
+ entry.refAllele() + "/" + entry.variantAllele() + "\t" | ||
+ entry.strand().symbol; | ||
} | ||
|
||
@Nonnegative | ||
@Override | ||
public long nLinesProcessed() { | ||
return m_lineNumber.get(); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "VffWriter{" + | ||
"lineNumber=" + m_lineNumber.get() + | ||
'}'; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
package org.pharmgkb.parsers.vff; | ||
|
||
/**
 * Marker interface for VFF writers; it declares no members and extends nothing.
 *
 * <p>NOTE(review): {@code VffParserI} extends {@code LineParser<VffEntry>}, but this interface does
 * not extend {@code LineWriter<VffEntry>} — confirm whether that asymmetry is intended.
 */
public interface VffWriterI {
}
36 changes: 36 additions & 0 deletions
36
vff/src/main/java/org/pharmgkb/parsers/vff/model/VffEntry.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package org.pharmgkb.parsers.vff.model; | ||
|
||
import org.pharmgkb.parsers.core.model.NucleotideCode; | ||
|
||
import javax.annotation.Nonnegative; | ||
import javax.annotation.Nonnull; | ||
|
||
public record VffEntry( | ||
@Nonnull ChromosomeName chromosome, | ||
@Nonnegative int start, | ||
@Nonnegative int end, | ||
@Nonnull String refAllele, | ||
@Nonnull String variantAllele, | ||
@Nonnull Strand strand | ||
) { | ||
|
||
@Nonnull | ||
public List<NucleotideCode> refBases() { | ||
return refAllele.chars().mapToObj(c -> String.valueOf(c)).map(NucleotideCode::valueOf).toList(); | ||
} | ||
|
||
@Nonnull | ||
public List<NucleotideCode> variantBases() { | ||
return variantAllele.chars().mapToObj(c -> String.valueOf(c)).map(NucleotideCode::valueOf).toList(); | ||
} | ||
|
||
@Nonnull | ||
public List<NucleotideCode> insertedBases() { | ||
return start == end + 1 ? variantBases() : List.empty(); | ||
} | ||
|
||
@Nonnull | ||
public List<NucleotideCode> deleted() { | ||
return start == end - 2 ? refBases() : List.empty(); | ||
} | ||
} |