Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue in read_bam implementation, it got the wrong contig name #206

Merged
merged 6 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bioframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"fetch_centromeres",
"fetch_chromsizes",
"load_fasta",
"read_alignment",
"read_bam",
"read_bigbed",
"read_bigwig",
Expand Down Expand Up @@ -117,6 +118,7 @@
fetch_centromeres,
fetch_chromsizes,
load_fasta,
read_alignment,
read_bam,
read_bigbed,
read_bigwig,
Expand Down
2 changes: 2 additions & 0 deletions bioframe/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .bed import to_bed
from .fileops import (
load_fasta,
read_alignment,
read_bam,
read_bigbed,
read_bigwig,
Expand All @@ -23,6 +24,7 @@
"read_tabix",
"read_pairix",
"read_bam",
"read_alignment",
"load_fasta",
"read_bigwig",
"to_bed",
Expand Down
61 changes: 41 additions & 20 deletions bioframe/io/fileops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import array
import io
import json
import os
Expand Down Expand Up @@ -231,35 +232,55 @@ def read_pairix(
return df


def read_bam(fp, chrom=None, start=None, end=None):
def read_alignment(fp, chrom=None, start=None, end=None):
"""
Read bam records into a DataFrame.
Read alignment records into a DataFrame.
"""
import pysam

with closing(pysam.AlignmentFile(fp, "rb")) as f:
bam_iter = f.fetch(chrom, start, end)
records = [
(
s.qname,
s.flag,
s.rname,
s.pos,
s.mapq,
s.cigarstring if s.mapq != 0 else np.nan,
s.rnext,
s.pnext,
s.tlen,
s.seq,
s.qual,
json.dumps(OrderedDict(s.tags)),
ext = os.path.splitext(fp)[1]
if ext == '.sam':
mode = 'r'
elif ext == '.bam':
mode = 'rb'
elif ext == '.cram':
mode = 'rc'
else:
raise ValueError(f'{ext} is not a supported filetype')

with closing(pysam.AlignmentFile(fp, mode)) as f:
records = []
for s in f.fetch(chrom, start, end):
# Needed because array.array is not json serializable
tags = [(k, v.tolist() if type(v) == array.array else v) for k, v in s.tags]
records.append(
(
s.qname,
s.flag,
s.reference_name,
s.pos,
s.mapq,
s.cigarstring if s.mapq != 0 else np.nan,
s.rnext,
s.pnext,
s.tlen,
s.seq,
s.qual,
json.dumps(dict(tags)),
)
)
for s in bam_iter
]
df = pd.DataFrame(records, columns=BAM_FIELDS)
return df


def read_bam(fp, chrom=None, start=None, end=None):
"""
Deprecated: use `read_alignment` instead.
Read bam file into dataframe,
"""
return read_alignment(fp, chrom, start, end)


class PysamFastaRecord:
def __init__(self, ff, ref):
self.ff = ff
Expand Down
Binary file added tests/test_data/toy.bam
Binary file not shown.
Binary file added tests/test_data/toy.bam.bai
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_data/toy.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
@SQ SN:ref LN:45
@SQ SN:ref2 LN:40
r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
r003 16 ref 29 30 6H5M * 0 0 TAGGC *
r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
11 changes: 11 additions & 0 deletions tests/test_fileops.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,14 @@ def test_read_beds():
for schema in schemas:
_ = bioframe.read_table(f'tests/test_data/{schema}.bed', schema=schema,
schema_is_strict=True)


def test_read_sam():
# SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam
_ = bioframe.read_alignment('tests/test_data/toy.sam')


def test_read_bam():
# converted toy.sam via `samtools view -bS toy.sam > toy.bam;
# index file created with `samtools index toy.bam`
_ = bioframe.read_alignment('tests/test_data/toy.bam')