diff --git a/.build.sh b/.build.sh index 65984a2..7ff3130 100755 --- a/.build.sh +++ b/.build.sh @@ -35,8 +35,11 @@ EOF # cd +rm -rf kmerdb-0.7*dist-info/ kmerdb.egg-info/ build/ + python -m build auditwheel repair --plat manylinux2014_x86_64 dist/kmerdb-*linux_x86_64.whl mv wheelhouse/* dist rm dist/kmerdb-*linux_x86_64.whl + diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py index 3f1e37a..c1fcdbb 100644 --- a/kmerdb/__init__.py +++ b/kmerdb/__init__.py @@ -1058,10 +1058,187 @@ def get_header(line, header): #kdb_out._handle.close() sys.stderr.write(config.DONE) +def make_kdbg(arguments): + """ + Another ugly function that takes a argparse Namespace object as its only positional argument + + Basically, from a fasta file, I want to generate a file format that consists of unique row ids, k-mer ids, adjacency list, + + and finally an int field (0 represents unused) representing the order of the row-kmer being used in a graph traversal. + + Note, that the .kdbg format may not be easily regenerated from a k-mer count vector alone, and thus a .fasta/.fastq is needed. + + The goal isn't to yet implement the traversal algorithm in this first commit. But instead I'd like to get the format specified. + """ + from multiprocessing import Pool + + import numpy as np + + from kmerdb import graph, kmer, util + from kmerdb.config import VERSION + + + + logger.debug("Printing entire CLI argparse option Namespace...") + logger.debug(arguments) + + # The extension should be .kdb because I said so. + logger.info("Checking extension of output file...") + if os.path.splitext(arguments.kdbg)[-1] != ".kdbg": + raise IOError("Destination .kdbg filepath does not end in '.kdbg'") + + file_metadata = [] + total_kmers = 4**arguments.k # Dimensionality of k-mer profile + N = total_kmers + + logger.info("Parsing {0} sequence files to generate a k-mer adjacency list...".format(len(list(arguments.seqfile)))) + infile = graph.Parseable(arguments) # + + + logger.info("Processing {0} fasta/fastq files across {1} processors...".format(len(list(arguments.seqfile)), arguments.parallel)) + logger.debug("Parallel (if specified) mapping the kmerdb.parse.parsefile() method to the seqfile iterable") + logger.debug("In other words, running the kmerdb.parse.parsefile() method on each file specified via the CLI") + if arguments.parallel > 1: + with Pool(processes=arguments.parallel) as pool: + data = pool.map(infile.parsefile, arguments.seqfile) + else: + data = list(map(infile.parsefile, arguments.seqfile)) + edges = list(map(lambda e: e[0], data)) + headers = list(map(lambda d: d[1], data)) + + sys.stderr.write("\n\n\tCompleted summation and metadata aggregation across all inputs...\n\n") + all_observed_kmers = int(np.sum(list(map(lambda h: h['total_kmers'], headers)))) + + + unique_kmers = int(np.sum(list(map(lambda h: h['unique_kmers'], headers)))) + total_nullomers = total_kmers - unique_kmers + + + + metadata=OrderedDict({ + "version": VERSION, + "metadata_blocks": 1, + "k": arguments.k, + "total_kmers": all_observed_kmers, + "unique_kmers": unique_kmers, + "unique_nullomers": total_nullomers, + "sorted": arguments.sorted, + "n1_dtype": "uint64", + "n2_dtype": "uint64", + "weight_dtype": "uint64", + "tags": [], + "files": headers + }) + + try: + np.dtype(metadata["n1_dtype"]) + np.dtype(metadata["n2_dtype"]) + np.dtype(metadata["weight_dtype"]) + except TypeError as e: + logger.error(e) + logger.error("kmerdb encountered a type error and needs to exit") + raise TypeError("Incorrect dtype.") + + + N = len(edges[0].keys()) # edges is a list of dicts, where keys are a 2-tuple (e.g. (15633431, 12202077) ) representing an edge + + + logger.debug("Initializing Numpy arrays of {0} uint zeroes for the edge lists...".format(N)) + n1 = np.array(range(N), dtype=metadata["n1_dtype"]) + n2 = np.array(range(N), dtype=metadata["n2_dtype"]) + weights = np.array(range(N), dtype=metadata["weight_dtype"]) + logger.info("Initialization of profile completed, using approximately {0} bytes per array".format(n1.nbytes)) + yaml.add_representer(OrderedDict, util.represent_ordereddict) + sys.stderr.write(yaml.dump(metadata, sort_keys=False)) + + logger.info("Generated metadata for .kdbg...") + + """ + At this point, unpacking should be second nature but it took about 2 minutes to get this sorted out, rebuilding, recounting k-mers, and watching dota2. + """ + # print("'edges' type:'") + # print(type(edges)) + # print(edges) + # sys.stderr.write("ALMOST DONE!!") + # sys.exit(1) + + + """ + Now we process the edge list dict, so that the output + """ + logger.info("Accumulating all edge weights...") + # Step 1: over each file's weighted edge list: initialize the result dict + result = {} + + print + for es in edges: + for i, e in enumerate(es.keys()): + try: + result[e] = 0 + except KeyError as e: + logger.debug("Could not find a valid (empty or recorded) edge relationship in the {0}'th input file's weighted edgelist") + raise e + # Step 2: over each file's weighted edge list, accumulate all counts observed in the .fa/.fq files. + for es in edges: + for i, e in enumerate(es.keys()): + result[e] += es[e] + # Step 3: pretty print a table of results. + + logger.debug("Storing all edges (node pairs) and weights in previously allocated numpy arrays") + for i, e in enumerate(result.keys()): + # Find the i'th key and unpack into variables + # Store the k-mer ids + n1[i] = e[0] # This is the result dicts i'th key's first node of the key's 2-tuple + n2[i] = e[1] # This is the result dicts i'th key's second node of the key's 2-tuple + # Store the edge weight + weights[i] = result[ (n1[i], n2[i] ) ] + + """ + Cause pandas isn't edge-y enough. + """ + # Convert the list of numpy arrays (the uint64 3-tuple of the edge) to pandas dataframe + #df = pd.DataFrame(twoD_weighted_edge_list) + #df.to_csv(sys.stdout, sep=arguments.output_delimiter, index=False) + + kdbg_out = graph.open(arguments.kdbg, 'wb', metadata=metadata) + + + try: + sys.stderr.write("\n\n\nWriting edge list to {0}...\n\n\n".format(arguments.kdbg)) + for i, node1 in enumerate(n1): + + node2 = n2[i] + w = weights[i] + + if arguments.edges is True: + print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w)) + # node1, node2, weight + kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w)) + finally: + kdbg_out._write_block(kdbg_out._buffer) + kdbg_out._handle.flush() + kdbg_out._handle.close() + + sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers)) + sys.stderr.write("Final nullomer count: {0}\n".format(total_nullomers)) + sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers)) + sys.stderr.write("Total {0}-mer count: {1}\n".format(arguments.k, total_kmers)) + sys.stderr.write("="*30 + "\n") + sys.stderr.write(".kdbg stats:\n") + sys.stderr.write("-"*30 + "\n") + sys.stderr.write("Edges in file: {0}\n".format(N)) + sys.stderr.write("Non-zero weights: {0}\n".format(int(np.count_nonzero(weights)))) + sys.stderr.write("\nDone\n") + + logger.info("Done printing weighted edge list to .kdbg") + + sys.stderr.write(config.DONE) + + def profile(arguments): """ - A complex, near-end user function that handles an arparse Namespace as its only positional argument + A complex, near-end user function that handles a argparse Namespace as its only positional argument This function handles multiprocessing, NumPy type checking and array initialization, full metadata expansion if needed. @@ -1288,7 +1465,8 @@ def profile(arguments): kdb_out._write_block(kdb_out._buffer) kdb_out._handle.flush() kdb_out._handle.close() - + sys.stderr.write("Final stats:\n") + sys.stderr.write("=" * 30 + "\n") sys.stderr.write("Total k-mers processed: {0}\n".format(all_observed_kmers)) sys.stderr.write("Final nullomer count: {0}\n".format(total_nullomers)) sys.stderr.write("Unique {0}-mer count: {1}\n".format(arguments.k, unique_kmers)) @@ -1372,6 +1550,22 @@ def cli(): header_parser.add_argument("-j", "--json", help="Print as JSON. DEFAULT: YAML") header_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)") header_parser.set_defaults(func=header) + + graph_parser = subparsers.add_parser("graph", help="Generate an adjacency list from .fa/.fq files") + graph_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") + graph_parser.add_argument("-k", default=12, type=int, help="Choose k-mer size (Default: 12)") + + graph_parser.add_argument("-p", "--parallel", type=int, default=1, help="Shred k-mers from reads in parallel") + + graph_parser.add_argument("-b", "--fastq-block-size", type=int, default=100000, help="Number of reads to load in memory at once for processing") + graph_parser.add_argument("--both-strands", action="store_true", default=False, help="Retain k-mers from the forward strand of the fast(a|q) file only") + graph_parser.add_argument("--edges", action="store_true", default=False, help="Do not list all edges to stderr") + graph_parser.add_argument("--sorted", action="store_true", default=False, help=argparse.SUPPRESS) + #profile_parser.add_argument("--sparse", action="store_true", default=False, help="Whether or not to store the profile as sparse") + + graph_parser.add_argument("seqfile", nargs="+", type=str, metavar="<.fasta|.fastq>", help="Fasta or fastq files") + graph_parser.add_argument("kdbg", type=str, help=".kdbg file") + graph_parser.set_defaults(func=make_kdbg) view_parser = subparsers.add_parser("view", help="View the contents of the .kdb file") view_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") diff --git a/kmerdb/config.py b/kmerdb/config.py index 79676ea..dbccd0d 100644 --- a/kmerdb/config.py +++ b/kmerdb/config.py @@ -17,9 +17,58 @@ -VERSION="0.7.6" +VERSION="0.7.7" header_delimiter = "\n" + ("="*24) + "\n" +graph_schema = { + "type": "object", + "properties": { + "version": {"type": "string"}, + "metadata_blocks": {"type": "number"}, + "k": {"type": "number"}, + "total_kmers": {"type": "number"}, + "unique_kmers": {"type": "number"}, + "unique_nullomers": {"type": "number"}, + "sorted": {"type": "boolean"}, + "n1_dtype": {"type": "string"}, + "n2_dtype": {"type": "string"}, + "weight_dtype": {"type": "string"}, + "tags": { + "type": "array", + "items": {"type": "string"} + }, + "files": { + "type": "array", + "items": { + "type": "object", + "properties": { + "filename": {"type": "string"}, + "sha256": { + "type": "string", + "minLength": 64, + "maxLength": 64 + }, + "md5": { + "type": "string", + "minLength": 32, + "maxLength": 32 + }, + "total_reads": {"type": "number"}, + "total_kmers": {"type": "number"}, + "unique_kmers": {"type": "number"}, + "nullomers": {"type": "number"} + }, + "required": ["filename", "sha256", "md5", "total_reads", "total_kmers", "unique_kmers", "nullomers"] + } + }, + "comments": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["version", "metadata_blocks", "k", "tags", "files", "total_kmers", "unique_kmers", "unique_nullomers", "n1_dtype", "n2_dtype", "weight_dtype"] +} + metadata_schema = { "type": "object", "properties": { diff --git a/kmerdb/graph.py b/kmerdb/graph.py new file mode 100644 index 0000000..4f81d22 --- /dev/null +++ b/kmerdb/graph.py @@ -0,0 +1,399 @@ +''' + Copyright 2020 Matthew Ralston + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +''' +import pdb + +import sys +import os +import gzip +import tempfile +import yaml, json +from collections import OrderedDict +import math +import re + +from builtins import open as _open + +import jsonschema +from Bio import SeqIO, Seq, bgzf + + +import numpy as np + +from kmerdb import fileutil, seqparser, kmer, config, util + +# Logging configuration +import logging +logger = logging.getLogger(__file__) + + + +def open(filepath, mode="r", metadata=None, slurp:bool=False): + """ + Opens a file for reading or writing. Valid modes are 'xrwbt'. + Returns a lazy-loading KDBGReader object or a KDBGWriter object. + The data may be force loaded with 'slurp=True' + + :param filepath: + :type filepath: str + :param mode: + :type mode: str + :param metadata: The file header/metadata dictionary to write to the file. + :type metadata: dict + :param slurp: Immediately load all data into KDBGReader + :type slurp: bool + :returns: kmerdb.fileutil.KDBGReader/kmerdb.fileutil.KDBGWriter + :rtype: kmerdb.fileutil.KDBGReader + """ + if type(filepath) is not str: + raise TypeError("kmerdb.graph.open expects a str as its first positional argument") + elif type(mode) is not str: + raise TypeError("kmerdb.graph.open expects the keyword argument 'mode' to be a str") + elif ("w" in mode or "x" in mode) and (metadata is None or not isinstance(metadata, OrderedDict)): + raise TypeError("kmerdb.graph.open expects an additional metadata dictionary") + elif type(slurp) is not bool: + raise TypeError("kmerdb.graph.open expects a boolean for the keyword argument 'slurp'") + modes = set(mode) + if modes - set("xrwbt") or len(mode) > len(modes): + raise ValueError("invalid mode: {}".format(mode)) + + + creating = "x" in modes + reading = "r" in modes + writing = "w" in modes + binary = "b" in modes + text = "t" in modes + + if text and binary: + raise ValueError("can't have text and binary mode at once") + elif not (creating or reading or writing): + raise ValueError("must have exactly one or read/write") + + if "r" in mode.lower(): + return KDBGReader(filename=filepath, mode=mode, slurp=slurp) + elif "w" in mode.lower() or "x" in mode.lower(): + return KDBGWriter(filename=filepath, mode=mode, metadata=metadata) + else: + raise ValueError("Bad mode %r" % mode) + + + # DO STUFF + +class KDBGReader(bgzf.BgzfReader): + """ + A class to read .kdbg files + + """ + def __init__(): + + return + + +class KDBGWriter(bgzf.BgzfWriter): + """ + A wrapper class around Bio.bgzf.BgzfWriter to write a .kdbg file to disk. + + :ivar metadata: OrderedDict + :ivar filename: str + :ivar mode: str + :ivar fileobj: io.IOBase + :ivar compresslevel: int + """ + + def __init__(self, metadata:OrderedDict, filename=None, both_strands:bool=False, mode="w", fileobj=None, compresslevel=6): + """Initilize the class.""" + if not isinstance(metadata, OrderedDict): + raise TypeError("kmerdb.fileutil.KDBWriter expects a valid metadata object as its first positional argument") + try: + logger.debug("Validating metadata schema against the config.py header schema") + jsonschema.validate(instance=dict(metadata), schema=config.graph_schema) + self.metadata = metadata + self.k = self.metadata['k'] + except jsonschema.ValidationError as e: + logger.debug(metadata) + logger.debug(e) + logger.error("kmerdb.fileutil.KDBReader couldn't validate the header YAML") + raise e + + if fileobj: + assert filename is None, "kmerdb.graph expects filename to be None is fileobj handle is provided" + handle = fileobj + else: + if "w" not in mode.lower() and "a" not in mode.lower(): + raise ValueError("Must use write or append mode, not %r" % mode) + if "a" in mode.lower(): + raise NotImplementedError("Append mode is not implemented yet") + # handle = _open(filename, "ab") + else: + handle = _open(filename, "wb") + self._text = "b" not in mode.lower() + self._handle = handle + self._buffer = b"" if "b" in mode.lower() else "" + self.compresslevel = compresslevel + + """ + Write the header to the file + """ + + + logger.info("Constructing a new .kdbg file '{0}'...".format(self._handle.name)) + yaml.add_representer(OrderedDict, util.represent_ordereddict) + if "b" in mode.lower(): + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_plus_delimiter_in_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + self.metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_plus_delimiter_in_bytes) / ( 2**16 ) ) # First estimate + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + self.metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_bytes) / ( 2**16 ) ) # Second estimate + metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8') + metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8') + logger.info("Writing the {0} metadata blocks to the new file".format(self.metadata["metadata_blocks"])) + logger.debug(self.metadata) + logger.debug("Header is being written as follows:\n{0}".format(yaml.dump(self.metadata, sort_keys=False))) + # 03-04-2024 This is still not a completely functional method to write data to bgzf through the Bio.bgzf.BgzfWriter class included in BioPython + # I've needed to implement a basic block_writer, maintaining compatibility with the Biopython bgzf submodule. + #self.write(bytes(yaml.dump(metadata, sort_keys=False), 'utf-8')) + + for i in range(self.metadata["metadata_blocks"]): + metadata_slice = metadata_bytes[:65536] + metadata_bytes = metadata_bytes[65536:] + self._write_block(metadata_slice) + + #self._write_block + self._buffer = b"" + self._handle.flush() + elif "w" == mode.lower() or "x" == mode.lower(): + self.write(yaml.dump(metadata, sort_keys=False)) + self._buffer = "" + self._handle.flush() + else: + logger.error("Mode: {}".format(mode.lower())) + raise RuntimeError("Could not determine proper encoding for write operations to .kdbg file") + + + + +def parsefile(filepath:str, k:int, quiet:bool=True, b:int=50000, both_strands:bool=False): + """ + Parse a single sequence file. + + :param filepath: Path to a fasta or fastq file + :type filepath: str + :param k: Choice of k to shred k-mers with + :type k: int + :param b: Number of reads (per block) to process in parallel + :type b: int + :param both_strands: Strand specificity argument for k-mer shredding process + :type both_strands: bool + :returns: (graph, header_dictionary) header_dictionary is the file's metadata for the header block + :rtype: (numpy.ndarray, dict) + + """ + + from kmerdb import kmer + if type(filepath) is not str: + raise TypeError("kmerdb.graph.parsefile expects a str as its first positional argument") + elif not os.path.exists(filepath): + raise OSError("kmerdb.graph.parsefile could not find the file '{0}' on the filesystem".format(filepath)) + elif type(k) is not int: + raise TypeError("kmerdb.graph.parsefile expects an int as its second positional argument") + elif type(b) is not int: + raise TypeError("kmerdb.parse.parsefile expects the keyword argument 'b' to be an int") + elif type(both_strands) is not bool: + raise TypeError("kmerdb.graph.parsefile expects the keyword argument 'both_strands' to be a bool") + + total_kmers = 4**k + kmers = [] + + + logger.debug("Initializing graph parser...") + seqprsr = seqparser.SeqParser(filepath, b, k) + fasta = not seqprsr.fastq # Look inside the seqprsr object for the type of file + + # Initialize the graph structure + Kmer = kmer.Kmers(k, strand_specific=not both_strands, verbose=fasta, all_metadata=True) # A wrapper class to shred k-mers with + + + recs = [r for r in seqprsr] + + if not fasta: + logger.debug("Read exactly {0} records from the seqparser object".format(len(recs))) + assert len(recs) <= b, "The seqparser should return exactly {0} records at a time".format(b) + else: + logger.debug("Skipping the block size assertion for fasta files") + + logger.info("Read {0} sequences from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq")) + + + while len(recs): + num_recs = len(recs) + logger.info("Processing a block of {0} reads/sequences".format(num_recs)) + + kmer_ids = [x for y in list(map(Kmer._shred_for_graph, recs)) for x in y] + # Flatmap to 'kmer_ids', the dictionary of {'id': read_id, 'kmers': [ ... ]} + logger.debug("\n\nAcquiring list of all k-mer ids from {0} sequence records...\n\n".format(num_recs)) + + kmers = kmers + kmer_ids + # END WHILE + recs = [r for r in seqprsr] # The next block of exactly 'b' reads + # This will be logged redundantly with the sys.stderr.write method calls at line 141 and 166 of seqparser.py (in the _next_fasta() and _next_fastq() methods) + #sys.stderr("\n") + logger.info("Read {0} more records from the {1} seqparser object".format(len(recs), "fasta" if fasta else "fastq")) + + logger.info("Read {0} k-mers from the input file".format(len(kmers))) + + sys.stderr.write("\n\n\n") + logger.info("K-mer counts read from input(s)") + sys.stderr.write("="*40 + "\n") + logger.info("K-space dimension: {0}".format(4**k)) + logger.info("Number of k-mers: {0}".format(len(kmers))) + sys.stderr.write("\n\n\n") + + logger.info("Constructing weighted edge list...") + + edge_list = make_graph(kmers, k, quiet=quiet) + logger.info("Number of edges: {0}".format(len(edge_list))) + + sys.stderr.write("Edge list generation completed...") + return (edge_list, seqprsr.header_dict()) + +def make_graph(kmer_ids, k:int=None, quiet:bool=True): + """ + Make a neighbor graph from a k-mer id list. + + More specifically, this is a edge list + + This particular adjacency list will have redundant relationships: + + """ + if k is None: + raise TypeError("kmerdb.graph.make_graph expects the keyword argument k to be an int") + + """ + Iterate over all k-mer ids, creating lists of neighbors derived from the kmer. + Omit the self relationship, as the kmer is not a neighbor of itself. + Convert to k-mer ids and add the adjacency tuple to the output + + A minimal neighbor graph is + """ + edges = {} + for k_id in set(kmer_ids): + current_kmer = kmer.id_to_kmer(k_id, k) + + char_goes_on_right = current_kmer[1:] # 11 characters (k - 1) (default 12), remove the left most, so new char goes on right to make the neighbor + char_goes_on_left = current_kmer[:-1] # 11 characters ( k - 1) + #common = current_kmer[1:-1] # 10 characters in common with all k-mers + + # Create the neighbor lists from the list of chars, prepended or appended to the k-mer suffix/prefix + char_first = list(map(lambda c: c + char_goes_on_left, kmer.binaryToLetter)) + char_last = list(map(lambda c: char_goes_on_right + c, kmer.binaryToLetter)) + char_first_ids = list(map(kmer.kmer_to_id, char_first)) + char_last_ids = list(map(kmer.kmer_to_id, char_last)) + #r = {"kmer_id": k_id, "left_neighbors": lneighbors_ids, "right_neighbors": rneighbors_ids} + neighbors_current_to_last = list(map(lambda x: (k_id, x), char_last_ids)) + neighbors_first_to_current = list(map(lambda x: (x, k_id), char_first_ids)) + neighbors = (neighbors_current_to_last + neighbors_first_to_current) + + """ + Hur dur + + at this point, I realized that that I had been removing the character on the wrong side from the new char insertion. + """ + # if k_id == 15633431 or k_id == 12202077: + # logger.debug("-"*11) + # logger.debug("K-mer {0} and all (non-self) neighbors:".format(k_id)) + # logger.debug(current_kmer) + # logger.debug("-"*11) + # logger.debug("<:" + ",".join(char_first) + " |||| " + common + " |||| " + ",".join(char_last) + ">:") + # logger.debug(",".join(list(map(str, char_first_ids))) + " |||| " + ",".join(list(map(str, char_last_ids)))) + for pair_ids in neighbors: + edges[pair_ids] = 0 + + """ + Old win condition. I wanted to see that a neighbor pair that was originally not found: + The 12-mers with the following ids, which had this subsequence in common 'GTGGATAACCT', was not found in the keys of the edge list dictionary. + """ + # if (15633431, 12202077) in neighbors: + # print("Sequence:") + # print(current_kmer) + # print("SUCCESS") + # print(edges) + # sys.exit(1) + """ + At this point, the adjacency structure is a simple list of tuples. Redundant tuples may exist in disparate parts of the graph. + + The structure of the neighbor_graph is [(from_node_id, to_node_id), ...] + The 'minimal neighbor graph' is provided by the variable 'min_neighbor_graph' from the preceding code block + """ + + """ + Next, the redundant k-mer id pairs are tallied, so that the edge weights are calculated + """ + num_kmer_ids = len(kmer_ids) + for i, val in enumerate(kmer_ids): + if i+1 == num_kmer_ids: + break + else: + """ + The following constitutes a 'neighboring' pair/edge of two k-mers (nodes): + the k-mers are sequential in the original k-mer id list (implicitly sorted by virtue of the orientation of the read in the .fasta/.fastq file) + this list of k-mers actual neighbors can be validated by inspecting debug level output below (-vv). + """ + pair_ids = (val, kmer_ids[i+1]) # pair of 'neighboring' k-mer ids + pair = (kmer.id_to_kmer(val, k), kmer.id_to_kmer(kmer_ids[i+1], k)) + + k1, k2 = pair + k1 = k1[1:] + k2 = k2[:-1] + try: + assert pair[0][1:] == pair[1][:-1], "kmerdb.graph expects neighboring k-mers to have at least k-1 residues in common. Must be from a fastq file." + except AssertionError as e: + logger.error("must be fastq") + + logger.error("mhmm") + raise e + if quiet is False: + logger.debug("Edge: {0} => {1}".format(pair[0], pair[1])) + try: + # Accumulate for each edge pair + edges[pair_ids] += 1 + except KeyError as e: + logger.error("Invalid key(pair) used to access edge list") + sys.stderr.write("PAIR: {0} => {1}".format(pair[0], pair[1])) + sys.stderr.write("pair ids: {0}".format(pair_ids)) + sys.stderr.write("Valid keys:") + sys.stderr.write(list(filter(lambda e: e[0] == 15633431, edges.keys()))) + raise e + + return edges + + +class Parseable: + def __init__(self, arguments): + self.arguments = arguments + + + def parsefile(self, filename): + """Wrapper function for graph.parsefile to keep arguments succinct for deployment through multiprocessing.Pool + + :param filename: the filepath of the fasta(.gz)/fastq(.gz) to process with parsefile -> seqparser + :type filename: str + :returns: (db, m, n) + """ + return parsefile(filename, self.arguments.k, quiet=not self.arguments.edges, b=self.arguments.fastq_block_size, both_strands=self.arguments.both_strands) + + + diff --git a/kmerdb/kmer.py b/kmerdb/kmer.py index 8e64b4d..171332c 100644 --- a/kmerdb/kmer.py +++ b/kmerdb/kmer.py @@ -110,21 +110,14 @@ def __init__(self, k, strand_specific=True, verbose=False, all_metadata=False): self.all_metadata = all_metadata self.__permitted_chars = set("ACTGRYSWKMBDHVN") - def shred(self, seqRecord): + def validate_seqRecord_and_detect_IUPAC(self, seqRecord:Bio.SeqRecord.SeqRecord, quiet_iupac_warning:bool=True): """ - Take a seqRecord fasta/fastq object and slice according to the IUPAC charset. - Doublets become replace with two counts, etc. - - :param seqRecord: - :type seqRecord: Bio.SeqRecord.SeqRecord - :returns: a parallel map - :rtype: dict - + Helper method for validating seqRecord and warnings for non-standard IUPAC residues. """ - - - if not isinstance(seqRecord, Bio.SeqRecord.SeqRecord): - raise TypeError("kmerdb.kmer.Kmers expects a Bio.SeqRecord.SeqRecord object as its first positional argument") + if type(quiet_iupac_warning) is not bool: + raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects keyword argument 'quiet_iupac_warning' to be a bool") + elif not isinstance(seqRecord, Bio.SeqRecord.SeqRecord): + raise TypeError("kmerdb.kmer.validate_seqRecord_and_detect_IUPAC expects a Bio.SeqRecord.SeqRecord object as its first positional argument") letters = set(seqRecord.seq) seqlen = len(seqRecord.seq) if seqlen < self.k: @@ -137,10 +130,62 @@ def shred(self, seqRecord): logger.warning(list(letters - self.__permitted_chars)) raise ValueError("Non-IUPAC symbols detected in the fasta file") elif len(all_iupac_symbols) > 0: - logger.warning("Will completely refuse to include k-mers with 'N'") - logger.warning("All counts for k-mers including N will be discarded") - logger.warning("Other IUPAC symbols will be replaced with their respective pairs/triads") - logger.warning("And a count will be given to each, rather than a half count") + if quiet_iupac_warning is False: + logger.warning("Will completely refuse to include k-mers with 'N'") + logger.warning("All counts for k-mers including N will be discarded") + logger.warning("Other IUPAC symbols will be replaced with their respective pairs/triads") + logger.warning("And a count will be given to each, rather than a half count") + elif quiet_iupac_warning is True: + logger.warning("Suppressing warning that non-standard IUPAC residues (including N) are detected.") + return (letters, seqlen) + + + def _shred_for_graph(self, seqRecord:Bio.SeqRecord.SeqRecord): + """ + Introduced in 0.7.7, required for valid assembly. Simply omits the rejection of IUPAC non-standard residues. + + """ + kmers = [] + + letters, seqlen = self.validate_seqRecord_and_detect_IUPAC(seqRecord, quiet_iupac_warning=True) + + + # Iterate over *each* k-mer in the sequence by index + for i in range(seqlen - self.k + 1): + s = seqRecord.seq[i:(i+self.k)] # Creates the k-mer as a slice of a seqRecord.seq + # No non-standard IUPAC residues allowed for _shred for use in graph.py + nonstandard_iupac_symbols = list(set(s) - standard_letters) + non_iupac_symbols = list(set(s) - self.__permitted_chars) + + if len(non_iupac_symbols) > 1: + logger.error("Non-IUPAC symbols:") + logger.error(non_iupac_symbols) + raise RuntimeError("Non-IUPAC symbol(s) detected...") + elif len(nonstandard_iupac_symbols) == 0: + #logger.debug("Perfect sequence content (ATCG) detected...") + kmers.append(kmer_to_id(s)) + elif len(nonstandard_iupac_symbols) == 1 and iupac_symbols[0] == n: + #logger.debug("Only N-abnormal sequence content detected (aside from ATCG)...") + kmers.append(kmer_to_id(s)) + elif len(nonstandard_iupac_symbols) > 1: + logger.error("Assembly cannot continue with this k-mer, and it will be discarded, possibly affecting assembly process") + raise RuntimeError("Non-standard IUPAC symbols detected in .fa/.fq file...") + return kmers + + def shred(self, seqRecord): + """ + Take a seqRecord fasta/fastq object and slice according to the IUPAC charset. + Doublets become replace with two counts, etc. + + :param seqRecord: + :type seqRecord: Bio.SeqRecord.SeqRecord + :returns: a parallel map + :rtype: dict + + """ + letters, seqlen = self.validate_seqRecord_and_detect_IUPAC(seqRecord, quiet_iupac_warning=True) + + kmers = [] starts = [] reverses = [] diff --git a/pyproject.toml b/pyproject.toml index 3fb858c..559227b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta" [project] name = "kmerdb" -version = "0.7.6" +version = "0.7.7" description = "Yet another kmer library for Python" readme = "README.md" authors = [{name="Matt Ralston ", email="mralston.development@gmail.com"}] diff --git a/setup.py b/setup.py index a444241..5645b2c 100644 --- a/setup.py +++ b/setup.py @@ -113,7 +113,7 @@ def can_import(module_name): EMAIL = 'mrals89@gmail.com' AUTHOR = 'Matthew Ralston' REQUIRES_PYTHON = '>=3.8.0' -VERSION = "0.7.6" +VERSION = "0.7.7" KEYWORDS = ["bioinformatics", "fastq", "fasta", "k-mer", "kmer", "k-merdb", "kmerdb", "kdb"], CLASSIFIERS = [ "Development Status :: 1 - Planning",