Skip to content

Commit

Permalink
Updates for v1.0.2 (#10)
Browse files Browse the repository at this point in the history
* setup initial tests

* test: updated testing actions and test structure

* feat: added in some tests for utils

* feat: added in function for reading in population panel files separately

* draft: updated some tests

* test: population panel testing

* feat: new function to add in population handling

* test: added in test for verification of samples

* feat: vcf2frequency table now uses cyvcf2

* draft: small commit - still needs some fixing on docs

* test: some refactoring and some testing

* doc: some additional docstrings

* feat: updated to include data in installation

* feat: finally got cyvcf2 features working

* feat: not supporting windows due to cyvcf2

* feat: added update to make custom lists a little bit easier

* doc: cleaned up a lot of the original documentation

* fix: initial attempt to address issue #8

* test: updated tests for utilities to be more comprehensive

* test: added in a test for gzipped frequency files

* ci: updated some of the github CI parameters

* Gzip streaming + additional tests (#9)

* Dev (#6)

* setup initial tests

* test: updated testing actions and test structure

* feat: added in some tests for utils

* feat: added in function for reading in population panel files separately

* draft: updated some tests

* test: population panel testing

* feat: new function to add in population handling

* test: added in test for verification of samples

* feat: vcf2frequency table now uses cyvcf2

* draft: small commit - still needs some fixing on docs

* test: some refactoring and some testing

* doc: some additional docstrings

* feat: updated to include data in installation

* feat: finally got cyvcf2 features working

* feat: not supporting windows due to cyvcf2

* feat: added update to make custom lists a little bit easier

* doc: cleaned up a lot of the original documentation

* fix: initial attempt to address issue #8

* test: updated tests for utilities to be more comprehensive

* test: added in a test for gzipped frequency files

* ci: updated some of the github CI parameters

* fix: updated version in setup.cfg

* ci: changes

* Revert "ci: changes"

This reverts commit 19c22ee.

* fix: added in new commit file due to exclusion

* ci: updated precommit and flake8 compliance

* ci: removed flake8 from ci
  • Loading branch information
aabiddanda authored Jan 29, 2023
1 parent 1703ff0 commit 7d8084b
Show file tree
Hide file tree
Showing 13 changed files with 172 additions and 92 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: [3.7, 3.9, 3.11]

steps:
- uses: actions/checkout@v2
Expand All @@ -36,7 +36,3 @@ jobs:
- name: run tests
run: |
python -m pytest tests/
- name: run flake8
run: |
flake8
6 changes: 1 addition & 5 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: [3.7, 3.9, 3.11]

steps:
- uses: actions/checkout@v2
Expand All @@ -36,7 +36,3 @@ jobs:
- name: run tests
run: |
python -m pytest tests/
- name: run flake8
run: |
flake8
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ geovar/data/*.csv
docsrc/_build/*
.hypothesis/
*.egg-info/
*.coverage
*.coverage*
*.python-version
build/
45 changes: 25 additions & 20 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=900']
- repo: https://github.com/psf/black
rev: 19.3b0
hooks:
- id: black
- repo: https://github.com/pycqa/pydocstyle
rev: 4.0.0 # pick a git hash / tag to point to
hooks:
- id: pydocstyle
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
hooks:
- id: flake8
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
hooks:
- id: trailing-whitespace
exclude: '^docs*/'
- id: end-of-file-fixer
exclude: '^docs/'
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=900']
- repo: https://github.com/psf/black
rev: 22.12.0
hooks:
- id: black
exclude: '^docs*/'
- repo: https://github.com/pycqa/pydocstyle
rev: 4.0.0 # pick a git hash / tag to point to
hooks:
- id: pydocstyle
exclude: '^docs*/'
- repo: https://github.com/pycqa/flake8
rev: 3.7.9
hooks:
- id: flake8
exclude: '^docs*/'
12 changes: 3 additions & 9 deletions docsrc/stubs/geovar.GeoVar.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,17 @@

.. autoclass:: GeoVar


.. automethod:: __init__


.. rubric:: Methods

.. autosummary::

~GeoVar.__init__
~GeoVar.add_freq_mat
~GeoVar.count_geovar_codes
~GeoVar.generate_bins
~GeoVar.geovar_binning
~GeoVar.geovar_codes_streaming






12 changes: 3 additions & 9 deletions docsrc/stubs/geovar.GeoVarPlot.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

.. autoclass:: GeoVarPlot


.. automethod:: __init__


.. rubric:: Methods

.. autosummary::

~GeoVarPlot.__init__
~GeoVarPlot.add_cmap
~GeoVarPlot.add_data_geovar
Expand All @@ -26,9 +26,3 @@
~GeoVarPlot.reorder_pops
~GeoVarPlot.set_colors
~GeoVarPlot.sort_geodist






25 changes: 6 additions & 19 deletions docsrc/stubs/geovar.utils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,17 @@

.. automodule:: geovar.utils












.. rubric:: Functions

.. autosummary::

read_pop_panel
sep_freq_mat_pops
vcf_to_freq_table
verify_sample_indices













53 changes: 31 additions & 22 deletions geovar/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import gzip
from .utils import sep_freq_mat_pops


Expand Down Expand Up @@ -44,7 +46,7 @@ def add_freq_mat(self, freq_mat_file):
(see example notebook for formatting).
"""
af_df = pd.read_table(freq_mat_file, sep=r"\s")
af_df = pd.read_table(freq_mat_file, sep=r"\s", engine="python")
pops, freq_mat = sep_freq_mat_pops(af_df)
self.pops = pops
self.freq_mat = freq_mat
Expand All @@ -58,14 +60,16 @@ def generate_bins(self, bins=[(0, 0), (0, 0.05), (0.05, 1.0)]):
bins (:obj:`list`): list of tuples specifying bins of allele frequency.
"""
assert np.all(np.array(bins) < 1.0)
b = 0.0
new_bins = []
for x in bins:
new_bins.append((b, x))
b = x
new_bins.append((b, 1.0))
self.bins = new_bins
assert np.all(np.array(bins) <= 1.0)
assert np.all(np.array(bins) >= 0.0)
min_val = 1.0
max_val = 0.0
for (start, end) in bins:
min_val = min(min_val, start)
max_val = max(max_val, end)
assert min_val >= 0
assert max_val <= 1
self.bins = bins

def geovar_binning(self):
"""Compute the GeoVar codes for each variant across each population."""
Expand All @@ -91,25 +95,30 @@ def geovar_codes_streaming(self, freq_mat_file):
"""Version of GeoVar code generation algorithm that streams through file to avoid memory overflow.
Args:
freq_mat_file (:obj:`string`): filepath to
frequency table file (see example notebook for formatting).
freq_mat_file (:obj:`string`): filepath to a frequency table file (see example notebook for formatting).
"""
assert self.bins is not None
freq_mat_fp = Path(freq_mat_file)
assert freq_mat_fp.is_file()
geovar_codes = []
# Setting up the testing bins
test_bins = np.array([x[1] for x in self.bins])
with open(freq_mat_file, "r") as f:
header = f.readline()
# Take the population labels currently
pops = np.array(header.split()[6:])
self.pops = pops
for line in tqdm(f):
# Split after the 6th column ...
maf_vector = np.array(line.split()[6:]).astype(np.float64)
cur_geovar = np.digitize(maf_vector, test_bins, right=True)
cur_geovar_code = "".join([str(i) for i in cur_geovar])
geovar_codes.append(cur_geovar_code)
if ".gz" in freq_mat_fp.suffixes:
f = gzip.open(freq_mat_fp, "rt")
else:
f = open(freq_mat_fp, "r")
header = f.readline()
# Take the population labels currently
pops = np.array(header.split()[6:])
self.pops = pops
for line in tqdm(f):
# Split after the 6th column ...
maf_vector = np.array(line.split()[6:]).astype(np.float64)
cur_geovar = np.digitize(maf_vector, test_bins, right=True)
cur_geovar_code = "".join([str(i) for i in cur_geovar])
geovar_codes.append(cur_geovar_code)
f.close()
# Setting the variables here
self.geovar_codes = np.array(geovar_codes)
self.n_variants = self.geovar_codes.size
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion geovar/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def vcf_to_freq_table(vcf_file, pop_df, outfile=None, minor_allele=True, **kwarg
"""
vcf_filepath = Path(vcf_file)
if not vcf_filepath.is_file():
raise ValueError(f"{vcf_file} is not a valid VCF file!")
raise FileNotFoundError(f"{vcf_file} is not a valid VCF file!")
vcf = VCF(vcf_filepath, **kwargs)
unique_pops, pop_idx_dict, pop_dict = verify_sample_indices(pop_df, vcf.samples)
chrom = []
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = geovar
version = 1.0.1
version = 1.0.2
author = Arjun Biddanda
description = A library to generate plots of joint allele frequency variation
long_description = README.md
Expand Down
66 changes: 66 additions & 0 deletions tests/test_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ def bins_greater_than_one():
return [(0, 0.5), (0.5, 1.1)]


@pytest.fixture
def bins_alt1():
    """Provide an alternative binning scheme made of four bins."""
    four_bins = [(0, 0), (0, 0.01), (0.01, 0.05), (0.05, 1.0)]
    return four_bins


def test_bin_boundaries(bins_less_than_zero, bins_greater_than_one):
"""Testing the bin boundaries."""
with pytest.raises(AssertionError):
Expand All @@ -28,3 +34,63 @@ def test_empty_bins():
"""Test that the bins are not empty."""
with pytest.raises(AssertionError):
GeoVar(bins=[])


def test_generate_bins(bins_less_than_zero, bins_greater_than_one):
    """Check that generate_bins rejects invalid bins and accepts a valid scheme."""
    geovar = GeoVar()
    # Both invalid fixtures must trip an assertion inside generate_bins.
    for invalid_bins in (bins_less_than_zero, bins_greater_than_one):
        with pytest.raises(AssertionError):
            geovar.generate_bins(bins=invalid_bins)
    # A scheme lying within [0, 1] must be accepted without raising.
    geovar.generate_bins(bins=[(0, 0), (0, 0.01), (0.01, 1.0)])


@pytest.fixture
def valid_freq_mat():
    """Return the path of the plain-text frequency matrix used as GeoVar input."""
    freq_table_path = (
        "geovar/data/new_1kg_nygc.chr22.biallelic_snps.filt.n5000.freq.txt"
    )
    return freq_table_path


@pytest.fixture
def valid_freq_mat_gz():
    """Return the path of the gzipped frequency matrix used as GeoVar input."""
    gz_freq_table_path = (
        "geovar/data/new_1kg_nygc.chr22.biallelic_snps.filt.n5000.freq.txt.gz"
    )
    return gz_freq_table_path


def test_add_freq_mat(valid_freq_mat):
    """Check that a valid frequency matrix file can be added to a GeoVar object."""
    geovar = GeoVar()
    # Passes as long as loading the file does not raise.
    geovar.add_freq_mat(valid_freq_mat)


def test_geovar_binning(valid_freq_mat, bins_alt1):
    """Run GeoVar binning under the initial bins and under an alternative scheme."""
    geovar = GeoVar()
    geovar.add_freq_mat(valid_freq_mat)

    # First pass: whatever bins the freshly constructed object carries.
    geovar.geovar_binning()

    # Second pass: swap in the alternative bins and bin again.
    geovar.generate_bins(bins_alt1)
    geovar.geovar_binning()


def test_count_geovar_codes(valid_freq_mat, bins_alt1):
    """Test that counting of geovar codes works properly.

    The ``bins_alt1`` fixture is requested but not used by the body; it is
    kept so the test signature stays unchanged.
    """
    geov_obj = GeoVar()
    geov_obj.add_freq_mat(valid_freq_mat)
    geov_obj.geovar_binning()
    # Count once; the first returned value is not checked by this test.
    _, n_geovar, ncat = geov_obj.count_geovar_codes()
    # Two non-zero categories ...
    assert ncat == 2
    assert all(n_geovar > 0)


def test_geovar_codes_streaming(valid_freq_mat, valid_freq_mat_gz):
    """Check that frequency tables can be streamed from plain and gzipped files."""
    geovar = GeoVar()
    # Same expectations for both formats, plain text first and then gzip.
    for freq_path in (valid_freq_mat, valid_freq_mat_gz):
        geovar.geovar_codes_streaming(freq_path)
        assert geovar.n_variants == 5000
        assert geovar.pops.size == geovar.n_populations
Loading

0 comments on commit 7d8084b

Please sign in to comment.