Skip to content

Commit

Permalink
Closes #120, Fixes #121. Bump version number. Tested.
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthewRalston committed Feb 3, 2024
1 parent 05a7306 commit 52b19f1
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 294 deletions.
28 changes: 10 additions & 18 deletions kmerdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,6 @@ def header(arguments):
logger.warning("KDB file version is out of date, may be incompatible with current fileutil.KDBReader class")
N = 4**kdb_in.metadata["k"]

assert kdb_in.profile.size == N, "view | read profile size did not match N from the header metadata"
assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata"
assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata"
assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata"
Expand Down Expand Up @@ -985,28 +984,24 @@ def get_header(line, header):
print(config.header_delimiter)
logger.info("Reading from file...")
logger.debug("I cut off the json-formatted unstructured column for the main view.")
#logger.debug(kdb_in.profile)
try:
if not arguments.un_sort and arguments.re_sort and metadata["sorted"] is True:
kmer_ids_sorted_by_count = np.lexsort((kdb_in.counts, kdb_in.kmer_ids))
reverse_kmer_ids_sorted_by_count = np.flipud(kmer_ids_sorted_by_count)
for i, idx in enumerate(kmer_ids_sorted_by_count):
p = kdb_in.profile[i]
kmer_id = kdb_in.kmer_ids[kdb_in.profile[i]]
logger.debug("The first is an implicit row-index. The second is the corresponding profile. The third is a k-mer id, then the counts and frequencies.")
logger.debug("{0}\t{1}\t{2}\t{3}\t{4}".format(i, p, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
kmer_id = kdb_in.kmer_ids[i]
logger.debug("The first is an implicit row-index. The second is a k-mer id, then the counts and frequencies.")
logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))

print("{0}\t{1}\t{2}\t{3}\t{4}".format(i, kmer_id, p, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
else:
for i, idx in enumerate(kdb_in.kmer_ids):
p = kdb_in.profile[i]
kmer_id = kdb_in.kmer_ids[kdb_in.profile[i]]
kmer_id = kdb_in.kmer_ids[idx]
logger.debug("The row in the file should follow this order:")
logger.debug("The first is an implicit row-index. The second is the corresponding profile. The third is a k-mer id, then the counts and frequencies.")
logger.debug("{0}\t{1}\t{2}\t{3}\t{4}".format(i, p, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
logger.debug("The first is an implicit row-index. The second is a k-mer id, then the counts and frequencies.")
logger.debug("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
try:
if arguments.un_sort is True:
assert p == idx, "view | kmer_id {0} didn't match the loaded profile {1}".format(idx, p)
assert kmer_id == idx, "view | kmer_id {0} didn't match the expected k-mer id.".format(idx, kmer_id)
assert i == kmer_id, "view | kmer_id {0} didn't match the implicit index {1}".format(idx, i)
else:
Expand All @@ -1019,9 +1014,9 @@ def get_header(line, header):
logger.debug("{0} line:".format(i))
logger.debug("=== = = = ======= = = = = = = |")
if arguments.un_sort is True:
print("{0}\t{1}\t{2}\t{3}\t{4}".format(i, i, idx, kdb_in.counts[idx], kdb_in.frequencies[idx]))
print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[idx], kdb_in.frequencies[idx]))
else:
print("{0}\t{1}\t{2}\t{3}\t{4}".format(i, p, idx, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
print("{0}\t{1}\t{2}\t{3}".format(i, idx, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))
# I don't think anyone cares about the graph representation.
# I don't think this actually matters because I can't figure out what the next data structure is.
# Is it a Cypher query and creation node set?
Expand All @@ -1042,7 +1037,6 @@ def get_header(line, header):
logger.debug("Creating '{0}'...".format(arguments.kdb_out))
if arguments.kdb_out is not None:
with fileutil.open(arguments.kdb_in, 'r', dtype=suggested_dtype, sort=arguments.sorted, slurp=True) as kdb_in:
assert kdb_in.profile.size == N, "view | read profile size did not match N from the header metadata"
assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata"
assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata"
assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata"
Expand All @@ -1052,10 +1046,8 @@ def get_header(line, header):
kmer_id = idx
seq = kmer.id_to_kmer(kmer_id, arguments.k)
kmer_metadata = kmer.neighbors(seq, arguments.k)
kmer_id = kdb_in.kmer_ids[kdb_in.profile[i]]
p = kdb_in.profile[j]
logger.debug("The first is the actual row id. This is the recorded row-id in the file. This should always be sequential. Next is the k-mer id. ")
kdb_out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(i, p, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id], kmer_metadata))
kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id], kmer_metadata))

except StopIteration as e:
logger.error(e)
Expand Down
2 changes: 1 addition & 1 deletion kmerdb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@



VERSION="0.7.5"
VERSION="0.7.6"
header_delimiter = "\n" + ("="*24) + "\n"

metadata_schema = {
Expand Down
Loading

0 comments on commit 52b19f1

Please sign in to comment.