from __future__ import annotations

import argparse
import itertools
import os
import sys
from collections import Counter, defaultdict
from collections.abc import Sized
from functools import total_ordering
from hashlib import md5
from pathlib import Path
from random import uniform
from typing import (
    Callable,
    Dict,
    Generator,
    Iterable,
    List,
    Literal,
    Optional,
    Sequence,
    Set,
    SupportsIndex,
    TextIO,
    Tuple,
    Union,
)

from Bio.Data.IUPACData import (
    ambiguous_dna_complement,  # type: ignore
    ambiguous_rna_complement,  # type: ignore
)
from Bio.Seq import translate  # type: ignore

from dark.aaVars import (
    AA_LETTERS,
    NONE,
    PROPERTIES,
    PROPERTY_DETAILS,
    START_CODON,
    STOP_CODONS,
)
from dark.dna import AMBIGUOUS, BASES_TO_AMBIGUOUS, FloatBaseCounts
from dark.errors import ReadLengthsNotIdenticalError
from dark.filter import SequenceFilter, TitleFilter
from dark.hsp import HSP


def _makeComplementTable(complementData: dict) -> Sequence[str]:
    """
    Build a lookup table for complementing nucleotide sequences.

    @param complementData: A C{dict} whose keys and values are strings of
        length one. Each key, value pair gives a substitution to perform
        during complementation. The substitution is registered for both the
        lowercase and the uppercase form of the key.
    @return: A C{tuple} of 256 one-character strings, indexed by character
        code, that can be used as a translation table by the C{translate}
        method of a Python string. Codes without a substitution map to
        their own character.
    """
    codes = list(range(256))
    for source, target in complementData.items():
        original, replacement = source[0], target[0]
        # Register the lowercase mapping first, then the uppercase one.
        for changeCase in (str.lower, str.upper):
            codes[ord(changeCase(original))] = ord(changeCase(replacement))
    return tuple(chr(code) for code in codes)


@total_ordering
class Read(Sized):
    """
    Hold information about a single read.

    @param id: A C{str} describing the read.
    @param sequence: A C{str} of sequence information (might be
        nucleotides or proteins).
    @param quality: An optional C{str} of phred quality scores. If not C{None},
        it must be the same length as C{sequence}.
    @raise ValueError: if the length of the quality string (if any) does not
        match the length of the sequence.
    """

    ALPHABET: Optional[set] = None

    def __init__(self, id: str, sequence: str, quality: Optional[str] = None):
        if quality is not None and len(quality) != len(sequence):
            raise ValueError(
                "Invalid read: sequence length (%d) != quality length (%d)"
                % (len(sequence), len(quality))
            )

        self.id = id
        self.sequence = sequence
        self.quality = quality

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Read):
            return (
                self.id == other.id
                and self.sequence == other.sequence
                and self.quality == other.quality
            )
        else:
            return NotImplemented

    def __ne__(self, other: object) -> bool:
        if isinstance(other, Read):
            return not self == other
        else:
            return NotImplemented

    def __lt__(self, other: object) -> bool:
        if isinstance(other, Read):
            return (self.id, self.sequence, self.quality) < (
                other.id,
                other.sequence,
                other.quality,
            )
        else:
            return NotImplemented

    def __len__(self) -> int:
        return len(self.sequence)

    def __hash__(self) -> int:
        """
        Calculate a hash key for a read.

        @return: The C{int} hash key for the read.
        """
        if self.quality is None:
            return hash(
                md5(
                    self.id.encode("UTF-8") + b"\0" + self.sequence.encode("UTF-8")
                ).digest()
            )
        else:
            return hash(
                md5(
                    self.id.encode("UTF-8")
                    + b"\0"
                    + self.sequence.encode("UTF-8")
                    + b"\0"
                    + self.quality.encode("UTF-8")
                ).digest()
            )

    def __getitem__(self, item: Union[int, slice]) -> Read:
        sequence = self.sequence[item]
        quality = None if self.quality is None else self.quality[item]
        return self.__class__(self.id, sequence, quality)

    def toString(self, format_: str = "fasta") -> str:
        """
        Convert the read to a string format.

        @param format_: Either 'fasta', 'fastq' or 'fasta-ss'.
        @raise ValueError: if C{format_} is 'fastq' and the read has no quality
            information, if C{format_} is 'fasta-ss' and the read has no
            structure information, or if an unknown format is requested.
        @return: A C{str} representing the read in the requested format.
        """
        if format_ == "fasta":
            return ">%s\n%s\n" % (self.id, self.sequence)
        elif format_ == "fastq":
            if self.quality is None:
                raise ValueError("Read %r has no quality information" % self.id)
            else:
                return "@%s\n%s\n+%s\n%s\n" % (
                    self.id,
                    self.sequence,
                    self.id,
                    self.quality,
                )
        else:
            raise ValueError("Format must be either 'fasta', 'fastq' or 'fasta-ss'.")

    def toDict(self) -> dict:
        """
        Get information about this read in a dictionary.

        @return: A C{dict} with keys/values for the attributes of self.
        """
        return {
            "id": self.id,
            "sequence": self.sequence,
            "quality": self.quality,
        }

    def reverse(self) -> Read:
        """
        Reverse a read (note that this is NOT a reverse complement).

        @return: The reversed sequence as an instance of the current class.
        """
        return self.__class__(
            self.id,
            self.sequence[::-1],
            None if self.quality is None else self.quality[::-1],
        )

    def rotate(self, n: int, inPlace: bool = True) -> Read:
        """
        Rotate the sequence left or right.

        @param n: The number of characters to rotate by. Positive means to
            rotate to the right, negative to the left.
        @return: The modified C{self} if C{inPlace} is true, else a new C{Read}.
        """
        sequence = self.sequence
        quality = self.quality

        if n == 0 or not sequence:
            newSequence = sequence
            newQuality = quality
        elif n > 0:
            # Rotate right
            n %= len(sequence)
            newSequence = sequence[-n:] + sequence[:-n]
            newQuality = None if quality is None else quality[-n:] + quality[:-n]
        else:
            # Rotate left
            n = -n % len(sequence)
            newSequence = sequence[n:] + sequence[:n]
            newQuality = None if quality is None else quality[n:] + quality[:n]

        if inPlace:
            self.sequence = newSequence
            self.quality = newQuality
            return self
        else:
            return self.__class__(self.id, newSequence, newQuality)

    @classmethod
    def fromDict(cls, d: dict) -> Read:
        """
        Create a new instance from attribute values provided in a dictionary.

        @param d: A C{dict} with keys/values for the attributes of a new
            instance of this class. Keys 'id' and 'sequence' with C{str} values
            must be provided. A 'quality' C{str} key is optional.
        @return: A new instance of this class, with values taken from C{d}.
        """
        return cls(d["id"], d["sequence"], d.get("quality"))

    def lowComplexityFraction(self) -> float:
        """
        What fraction of a read's bases are in low-complexity regions?
        By convention, a region of low complexity is indicated by lowercase
        base letters.

        @return: The C{float} representing the fraction of bases in the
            read that are in regions of low complexity.
        """
        length = len(self)
        if length:
            lowerCount = len(list(filter(str.islower, self.sequence)))
            return float(lowerCount) / length
        else:
            return 0.0

    def walkHSP(
        self, hsp: HSP, includeWhiskers: bool = True
    ) -> Generator[Tuple[int, str, bool], None, None]:
        """
        Provide information about exactly how a read matches a subject, as
        specified by C{hsp}.

        @param hsp: An C{HSP} instance.
        @param includeWhiskers: If C{True} yield information from the
            (possibly empty) non-matching ends of the read.
        @return: A generator that yields (offset, residue, inMatch) tuples.
            The offset is the offset into the matched subject. The residue is
            the base in the read (which might be '-' to indicate a gap in the
            read was aligned with the subject at this offset). inMatch will be
            C{True} for residues that are part of the HSP match, and C{False}
            for the (possibly non-existent) parts of the read that fall outside
            the HSP (aka, the "whiskers" in an alignment graph).
        """

        # It will be easier to understand the following implementation if
        # you refer to the ASCII art illustration of an HSP in the
        # dark.hsp._Base class in hsp.py

        # Left whisker.
        if includeWhiskers:
            readOffset = 0
            subjectOffset = hsp.readStartInSubject
            while subjectOffset < hsp.subjectStart:
                yield (subjectOffset, self.sequence[readOffset], False)
                readOffset += 1
                subjectOffset += 1

        # Match.
        for subjectOffset, residue in enumerate(
            hsp.readMatchedSequence, start=hsp.subjectStart
        ):
            yield (subjectOffset, residue, True)

        # Right whisker.
        if includeWhiskers:
            readOffset = hsp.readEnd
            subjectOffset = hsp.subjectEnd
            while subjectOffset < hsp.readEndInSubject:
                yield (subjectOffset, self.sequence[readOffset], False)
                readOffset += 1
                subjectOffset += 1

    def checkAlphabet(self, count: int = 10) -> set:
        """
        A function which checks whether the sequence in a L{dark.Read} object
        corresponds to its readClass. For AA reads, more testing is done in
        dark.Read.AARead.checkAlphabet.

        @param count: An C{int}, indicating how many bases or amino acids at
            the start of the sequence should be considered. If C{None}, all
            bases are checked.
        @return: A C{set} of the read characters in the first C{count}
            positions of sequence if these are a subset of the allowed alphabet for this
            read class, or if the read class has a C{None} alphabet.
        @raise ValueError: If the sequence alphabet is not a subset of the read
            class alphabet.
        """
        if count is None:
            readLetters = set(self.sequence.upper())
        else:
            readLetters = set(self.sequence.upper()[:count])
        # Check if readLetters is a subset of self.ALPHABET.
        if self.ALPHABET is None or readLetters.issubset(self.ALPHABET):
            return readLetters
        raise ValueError(
            "Read alphabet (%r) is not a subset of expected "
            "alphabet (%r) for read class %s."
            % (
                "".join(sorted(readLetters)),
                "".join(sorted(self.ALPHABET)),
                str(self.__class__.__name__),
            )
        )

    def newFromSites(self, sites: Set[int], exclude: bool = False) -> Read:
        """
        Create a new read from self, with only certain sites.

        @param sites: A set of C{int} 0-based sites (i.e., indices) in
            sequences that should be kept. If C{None} (the default), all sites
            are kept.
        @param exclude: If C{True} the C{sites} will be excluded, not
            included.
        @return: A new C{Read} instance.
        """
        if sites is None:
            sites = set(range(len(self)))

        if exclude:
            sites = set(range(len(self))) - sites

        newSequence = []
        if self.quality:
            newQuality = []
            for index, (base, quality) in enumerate(zip(self.sequence, self.quality)):
                if index in sites:
                    newSequence.append(base)
                    newQuality.append(quality)
            read = self.__class__(self.id, "".join(newSequence), "".join(newQuality))
        else:
            for index, base in enumerate(self.sequence):
                if index in sites:
                    newSequence.append(base)
            read = self.__class__(self.id, "".join(newSequence))

        return read

    def find(
        self,
        pattern: str,
        start: int = 0,
        end: bool = False,
        caseSensitive: bool = True,
        ignoreGaps: bool = False,
        gapCharacter: str = "-",
    ) -> int:
        """
        Find the first occurrence (from a given offset) of a pattern in our sequence.

        @param pattern: The subsequence to look for. Note that this is not a regular
            expression (we could add support for that).
        @param start: The offset to start searching from. This is an offset into
            the string that is actually searched, i.e., into the gap-free
            sequence when C{ignoreGaps} is true.
        @param end: If true, return the index of the end of the match. Note that this
            is the Python offset of the character immediately after the end of the
            matched string (conveniently, for a human counting from 1, this is the
            inclusive index of the end of the match).
        @param caseSensitive: If true, match the unchanged sequence and pattern. Else,
            the match is done case-insensitively.
        @param ignoreGaps: If true, gaps will be removed from both the read sequence
            and the pattern before the search. The returned offset is still into the
            original (gapped) sequence. This is useful for finding a sequence in a
            gapped multiple sequence alignment.
        @param gapCharacter: The single character used to indicate a gap. Only used
            if ignoreGaps is true.
        @return: The index of the pattern in self.sequence (or the index just beyond the
            pattern if C{end} is true), or -1 if the pattern is not found.
        """
        # Avoid a circular import.
        from dark.alignments import alignmentEnd, getGappedOffsets

        sequence = origSequence = self.sequence

        if ignoreGaps:
            # NOTE(review): gapCharacter is not passed to getGappedOffsets, so
            # presumably that function assumes its own default gap character
            # - confirm this is right when a non-default one is given.
            gappedOffsets = getGappedOffsets(sequence)
            sequence = sequence.replace(gapCharacter, "")
            # Remove gaps from the pattern too (as documented above). A
            # pattern containing gaps could otherwise never be found in the
            # gap-free sequence, and its length (used below to locate the end
            # of the match) would include the gap characters.
            pattern = pattern.replace(gapCharacter, "")

        if not caseSensitive:
            sequence = sequence.upper()
            pattern = pattern.upper()

        index = sequence.find(pattern, start)

        if index != -1:
            if ignoreGaps:
                # Convert the offset in the gap-free sequence back into an
                # offset in the original (gapped) sequence.
                index = gappedOffsets[index]
            if end:
                index = alignmentEnd(origSequence, index, len(pattern), gapCharacter)

        return index

    def getPrefixAndSuffixOffsets(
        self,
        prefix: str | None,
        suffix: str | None,
        ignoreGaps: bool = False,
        gapCharacter: str = "-",
    ) -> tuple[int, int]:
        """
        Find the indices of a prefix and (end of) a suffix in a read. Return -1 for
        patterns that are not be found.

        Note that there is no checking that the suffix ends after the start of the
        prefix (i.e., that the 'end' value is greater than (or >=) the 'start' value.
        Our caller needs to detect this and decide what to do. That's because there
        might be legitimate reasons for wanting to do that kind of search.
        """
        start = (
            self.find(
                prefix,
                caseSensitive=False,
                ignoreGaps=ignoreGaps,
                gapCharacter=gapCharacter,
            )
            if prefix
            else -1
        )
        end = (
            self.find(
                suffix,
                caseSensitive=False,
                ignoreGaps=ignoreGaps,
                gapCharacter=gapCharacter,
                end=True,
            )
            if suffix
            else -1
        )

        return start, end


class _NucleotideRead(Read):
    """
    Holds methods to work with nucleotide (DNA and RNA) sequences.
    """

    COMPLEMENT_TABLE: Sequence[Union[str, None]] = [None]

    def translations(
        self: Union[_NucleotideRead, DNARead, RNARead],
    ) -> Generator[TranslatedRead, None, None]:
        """
        Translate a nucleotide sequence in all six reading frames: frames 0,
        1, and 2 of the sequence itself, followed by frames 0, 1, and 2 of
        its reverse complement.

        @return: A generator that produces six L{TranslatedRead} instances.
        """
        rc = self.reverseComplement().sequence
        strandsAndFrames = itertools.product((False, True), (0, 1, 2))

        for reverseComplemented, frame in strandsAndFrames:
            strand = rc if reverseComplemented else self.sequence
            # Skip 0, 1, or 2 initial bases (depending on the frame), then
            # pad what remains with 'N' so its length is zero mod 3.
            toTranslate = strand[frame:]
            toTranslate += "N" * (-len(toTranslate) % 3)
            yield TranslatedRead(
                self, translate(toTranslate), frame, reverseComplemented
            )

    def reverseComplement(self: Union[_NucleotideRead, DNARead, RNARead]) -> Read:
        """
        Make the reverse complement of a nucleotide sequence.

        @return: A new instance of the current class, with the same id, whose
            sequence is the reversed result of translating this read's
            sequence with the class's C{COMPLEMENT_TABLE}, and whose quality
            (if any) is reversed.
        """
        backwards = slice(None, None, -1)
        complemented = self.sequence.translate(self.COMPLEMENT_TABLE)
        quality = self.quality
        if quality is not None:
            quality = quality[backwards]
        return self.__class__(self.id, complemented[backwards], quality)


class DNARead(_NucleotideRead):
    """
    Hold information and methods to work with DNA reads.
    """

    ALPHABET: set = set("ATCG")

    COMPLEMENT_TABLE = _makeComplementTable(ambiguous_dna_complement)

    def findORF(
        self,
        offset: int,
        forward: bool = True,
        requireStartCodon: bool = True,
        allowGaps: bool = True,
        untranslatable: Optional[Dict[str, str]] = None,
    ):
        """
        Read an ORF, codon by codon, starting from a given offset in a read.
        Reading stops after the first stop codon, or when there is no
        complete codon left.

        @param offset: The C{int} offset of the start codon.
        @param forward: If not C{True}, the reverse complement of the sequence
            should be examined.
        @param requireStartCodon: If C{True}, the first codon must be a start
            codon. If it is not, the search is abandoned immediately and the
            returned dictionary will have zero and C{False} values (and empty
            strings).
        @param allowGaps: If C{True}, gaps ('-') at or after C{offset} will
            be removed, else a ValueError is raised if there are any gaps in
            that part of the sequence.
        @param untranslatable: A C{dict} with C{str} keys and values. If any of
            the keys appears in a codon, the corresponding value is added to
            the translation. This can be used e.g., to make occurrences of '?'
            translate into '-' or 'X'.
        @raise ValueError: if C{allowGaps} is false and a gap is found at or
            after C{offset}.
        @return: A C{dict} with C{str} keys:
            length (int): the number of codons read (including the stop
                codon, if one was found).
            foundStartCodon (bool): if a start codon was found.
            foundStopCodon (bool): if a stop codon was found.
            sequence (str): the ORF nucleotide sequence.
            translation (str): the amino acid sequence for the ORF.
        """
        sequence = self.sequence if forward else self.reverseComplement().sequence

        upstream, downstream = sequence[:offset], sequence[offset:]
        if "-" in downstream:
            if not allowGaps:
                raise ValueError(
                    f"At least one gap ('-') character found in read "
                    f"{self.id!r} from offset {offset} or later."
                )
            sequence = upstream + downstream.replace("-", "")

        def toAminoAcid(codon: str) -> str:
            # A codon containing an untranslatable character gets the
            # replacement given for the first such character found.
            for char, replacement in (untranslatable or {}).items():
                if char in codon:
                    return replacement
            return translate(codon)

        foundStartCodon = foundStopCodon = False
        codons = []
        translation = []

        for codonNumber, index in enumerate(itertools.count(offset, 3)):
            codon = sequence[index : index + 3]
            if len(codon) != 3:
                # We ran out of sequence.
                break

            if codonNumber == 0:
                foundStartCodon = codon == START_CODON
                if requireStartCodon and not foundStartCodon:
                    break

            translation.append(toAminoAcid(codon))
            codons.append(codon)

            if codon in STOP_CODONS:
                foundStopCodon = True
                break

        return {
            "length": len(codons),
            "foundStartCodon": foundStartCodon,
            "foundStopCodon": foundStopCodon,
            "sequence": "".join(codons),
            "translation": "".join(translation),
        }


class RNARead(_NucleotideRead):
    """
    A nucleotide read holding RNA, with the alphabet and complement table
    needed to work with RNA sequences.
    """

    # The characters allowed in an RNA read.
    # NOTE(review): 'T' is accepted as well as 'U' - presumably deliberate.
    ALPHABET: set = {"A", "T", "C", "G", "U"}

    # Translation table used when (reverse) complementing, built from the
    # ambiguous RNA complement data.
    COMPLEMENT_TABLE = _makeComplementTable(ambiguous_rna_complement)


class DNAKozakRead(DNARead):
    """
    Hold information about a Kozak sequence.

    @param originalRead: The C{dark.reads.Read} instance in which the Kozak
        sequence was found.
    @param start: The C{int} start location of the Kozak sequence.
    @param stop: The C{int} stop location of the Kozak sequence (this is a
        Python string index, so the final Kozak sequence character is the one
        before this offset in the sequence).
    @param kozakQuality: A C{float}, giving the percentage of the 5 variable
        locations in the Kozak sequence that match the most frequent Kozak
        nucleotides.
    @raise ValueError: if C{start} is negative, if C{stop} is beyond the end
        of C{originalRead}, or if C{start} is greater than C{stop}.
    """

    def __init__(self, originalRead: Read, start: int, stop: int, kozakQuality: float):
        if start < 0:
            raise ValueError("start offset (%d) less than zero" % start)
        if stop > len(originalRead):
            raise ValueError(
                "stop offset (%d) > original read length (%d)"
                % (stop, len(originalRead))
            )
        if start > stop:
            raise ValueError(
                "start offset (%d) greater than stop offset (%d)" % (start, stop)
            )

        # The id records where in the original read this sequence lies.
        newId = "%s-(%d:%d)" % (originalRead.id, start, stop)

        # Test the quality against None (not truthiness) so that an empty
        # quality string is carried over instead of being dropped.
        quality = originalRead.quality
        if quality is not None:
            quality = quality[start:stop]

        super().__init__(newId, originalRead.sequence[start:stop], quality)

        self.originalRead = originalRead
        self.start = start
        self.stop = stop
        self.kozakQuality = kozakQuality

    def __eq__(self, other: object):
        """
        Two Kozak reads are equal when their id, sequence, quality, original
        read, offsets and Kozak quality all match.

        @param other: The object to compare with.
        @return: A C{bool}, or C{NotImplemented} if C{other} is not a
            C{DNAKozakRead}.
        """
        if isinstance(other, DNAKozakRead):
            return (
                self.id == other.id
                and self.sequence == other.sequence
                and self.originalRead == other.originalRead
                and self.quality == other.quality
                and self.start == other.start
                and self.stop == other.stop
                and self.kozakQuality == other.kozakQuality
            )
        else:
            return NotImplemented

    def __hash__(self) -> int:
        """
        Calculate a hash key for a read.

        A class that defines C{__eq__} without C{__hash__} has its
        C{__hash__} set to C{None} by Python, making its instances
        unhashable. The hash inherited from the base class is restored here.
        It is compatible with C{__eq__} above because equal Kozak reads
        necessarily have equal ids, sequences, and qualities.

        @return: The C{int} hash key for the read.
        """
        return super().__hash__()


class AARead(Read):
    """
    Hold information and methods to work with AA reads.
    """

    ALPHABET = set(AA_LETTERS)

    def checkAlphabet(self, count: int = 10) -> set:
        """
        Check that an AA read really contains amino acids. The extra test
        done here is needed because the letters in the DNA alphabet are also
        in the AA alphabet.

        @param count: An C{int}, indicating how many bases or amino acids at
            the start of the sequence should be considered. If C{None}, all
            bases are checked.
        @return: A C{set} of the characters in the first C{count} positions
            of the sequence, as returned by the superclass check.
        @raise ValueError: If the superclass check fails, or if the sequence
            of self is more than 10 characters long and the checked
            characters are all in 'ACGT' (so it looks like DNA has been
            passed to AARead()).
        """
        readLetters = super().checkAlphabet(count)
        looksLikeDNA = len(self) > 10 and readLetters <= set("ACGT")
        if looksLikeDNA:
            raise ValueError(
                "It looks like a DNA sequence has been passed to AARead()."
            )
        return readLetters

    def properties(self) -> Generator[int, None, None]:
        """
        Look up the properties of each residue in an amino acid sequence.
        Properties have the form: 'F': HYDROPHOBIC | AROMATIC.

        @return: A generator yielding, for each residue in the current
            sequence, its entry in C{PROPERTIES} (or C{NONE} if the residue
            has no entry).
        """
        residues = self.sequence
        return (PROPERTIES.get(residue, NONE) for residue in residues)

    def propertyDetails(self) -> Generator[Union[dict, int], None, None]:
        """
        Look up the detailed properties of each residue in an amino acid
        sequence. Each property of the amino acid gets a value scaled from
        -1 to 1.

        @return: A generator yielding, for each residue in the current
            sequence, its property dictionary from C{PROPERTY_DETAILS} (or
            C{NONE} if the residue has no entry).
        """
        residues = self.sequence
        return (PROPERTY_DETAILS.get(residue, NONE) for residue in residues)

    def ORFs(self, openORFs: bool = False) -> Generator[AAReadORF, None, None]:
        """
        Find all ORFs in our sequence.

        A closed ORF runs from just after an 'M' up to (but not including)
        the next '*'. An 'M' found inside an ORF does not start a new one,
        and ORFs of zero length are not yielded.

        @param openORFs: If C{True} allow ORFs that do not have a start codon
            and/or do not have a stop codon. I.e., also yield the ORF (if
            any) that is open at the start of the sequence and the ORF (if
            any) that is still open at its end.
        @return: A generator that yields AAReadORF instances that correspond
            to the ORFs found in the AA sequence.
        """
        # The scan is a small state machine. We are either in an ORF that is
        # open on the left (only possible from the start of the sequence, and
        # only when open ORFs are wanted), in an ORF that began with an 'M',
        # or outside any ORF.
        OPEN_LEFT, IN_ORF, OUTSIDE = "openLeft", "inORF", "outside"

        state = OPEN_LEFT if openORFs else OUTSIDE
        start = 0

        for offset, aa in enumerate(self.sequence):
            if aa == "*":
                if state == OPEN_LEFT:
                    if offset > 0:
                        yield AAReadORF(self, start, offset, True, False)
                elif state == IN_ORF and start != offset:
                    yield AAReadORF(self, start, offset, False, False)
                # A stop codon always leaves us outside any ORF.
                state = OUTSIDE
            elif aa == "M" and state == OUTSIDE:
                # The ORF begins with the residue after the start codon.
                start = offset + 1
                state = IN_ORF

        if not openORFs:
            return

        # End of sequence. Yield the final ORF, open to the right, if there
        # is one and it has non-zero length.
        total = len(self.sequence)
        if state == OPEN_LEFT:
            if total > 0:
                yield AAReadORF(self, start, total, True, True)
        elif state == IN_ORF and start < total:
            yield AAReadORF(self, start, total, False, True)


class AAReadWithX(AARead):
    """
    An AA read whose alphabet is extended with an additional character,
    'X'.
    """

    # The amino acid letters, plus 'X'.
    ALPHABET: set[str] = {*AA_LETTERS, "X"}


class AAReadORF(AARead):
    """
    Hold information about an ORF from an AA read.

    @param originalRead: The original L{AARead} instance in which this ORF
        occurs.
    @param start: The C{int} offset where the ORF starts in the original read.
    @param stop: The Python-style C{int} offset of the end of the ORF in the
        original read. The final index is not included in the ORF.
    @param openLeft: A C{bool}. If C{True}, the ORF potentially begins before
        the sequence given in C{sequence}. I.e., the ORF-detection code started
        to examine a read assuming it was already in an ORF. If C{False}, a
        start codon was found preceding this ORF.
    @param openRight: A C{bool}. If C{True}, the ORF potentially ends after
        the sequence given in C{sequence}. I.e., the ORF-detection code
        was in an ORF when it encountered the end of a read (so no stop codon
        was found). If C{False}, a stop codon was found in the read after this
        ORF.
    """

    def __init__(
        self,
        originalRead: AARead,
        start: int,
        stop: int,
        openLeft: bool,
        openRight: bool,
    ):
        """
        Make an ORF read from the C{start}:C{stop} region of an original read.

        @param originalRead: The L{AARead} instance in which this ORF occurs.
        @param start: The C{int} offset where the ORF starts in the original
            read.
        @param stop: The Python-style C{int} offset of the end of the ORF in
            the original read.
        @param openLeft: A C{bool}, C{True} if the ORF is open on the left.
        @param openRight: A C{bool}, C{True} if the ORF is open on the right.
        @raise ValueError: if C{start} is negative, if C{stop} is beyond the
            end of C{originalRead}, or if C{start} is greater than C{stop}.
        """
        if start < 0:
            raise ValueError("start offset (%d) less than zero" % start)
        if stop > len(originalRead):
            raise ValueError(
                "stop offset (%d) > original read length (%d)"
                % (stop, len(originalRead))
            )
        if start > stop:
            raise ValueError(
                "start offset (%d) greater than stop offset (%d)" % (start, stop)
            )

        # The id records the ORF's offsets, using interval notation: a
        # round bracket for an open end and a square bracket for a closed one.
        newId = "%s-%s%d:%d%s" % (
            originalRead.id,
            "(" if openLeft else "[",
            start,
            stop,
            ")" if openRight else "]",
        )

        # Test the quality against None (not truthiness) so that an empty
        # quality string is carried over instead of being dropped.
        quality = originalRead.quality
        if quality is not None:
            quality = quality[start:stop]

        super().__init__(newId, originalRead.sequence[start:stop], quality)

        self.start = start
        self.stop = stop
        self.openLeft = openLeft
        self.openRight = openRight

    def toDict(self) -> dict:
        """
        Summarize this ORF as a dictionary.

        @return: A C{dict} with the keys/values provided by the superclass,
            plus 'start', 'stop', 'openLeft', and 'openRight' keys holding
            the corresponding attributes of self.
        """
        inherited = super().toDict()
        return {
            **inherited,
            "start": self.start,
            "stop": self.stop,
            "openLeft": self.openLeft,
            "openRight": self.openRight,
        }

    @classmethod
    def fromDict(cls, d: dict) -> AARead:
        """
        Build a new instance from a dictionary of attribute values.

        @param d: A C{dict} with keys/values for the attributes of a new
            instance of this class. Keys 'id' and 'sequence' with C{str} values
            must be provided. A 'quality' C{str} key is optional. Keys 'start'
            and 'stop' must have C{int} values. Keys 'openLeft' and 'openRight'
            are C{bool}, all keys are as described in the docstring for this
            class.
        @raise KeyError: if a required key is missing from C{d}.
        @return: A new instance of this class, with values taken from C{d}.
        """
        # Start from an empty placeholder instance, then overwrite each of
        # its attributes with the values from the dictionary. Note that the
        # values are stored as given, without passing through __init__.
        instance = cls(AARead("", ""), 0, 0, True, True)

        for name in ("id", "sequence"):
            setattr(instance, name, d[name])

        instance.quality = d.get("quality")

        for name in ("start", "stop", "openLeft", "openRight"):
            setattr(instance, name, d[name])

        return instance


class SSAARead(AARead):
    """
    An amino acid read that also carries secondary structure information.

    Note that this class (currently) has no quality string associated with it.

    @param id: A C{str} describing the read.
    @param sequence: A C{str} of sequence information.
    @param structure: A C{str} of structure information, one character per
        character of C{sequence}.
    @raise ValueError: If the sequence and structure lengths are not the same.
    """

    def __init__(self, id, sequence: str, structure: str):
        super().__init__(id, sequence)
        self.structure = structure

        lengths = (len(sequence), len(structure))
        if lengths[0] != lengths[1]:
            raise ValueError(
                "Invalid read: sequence length (%d) != structure length (%d)" % lengths
            )

    def __eq__(self, other: object) -> bool:
        # Only compare against other SSAARead instances (or subclasses); let
        # Python try the reflected comparison for anything else.
        if not isinstance(other, SSAARead):
            return NotImplemented

        return (self.id, self.sequence, self.structure) == (
            other.id,
            other.sequence,
            other.structure,
        )

    def __hash__(self) -> int:
        """
        Compute a hash key from the id, sequence, and structure of the read.

        @return: The C{int} hash key for the read.
        """
        # The three fields are UTF-8 encoded and separated by NUL bytes
        # before being digested.
        fields = (self.id, self.sequence, self.structure)
        encoded = b"\0".join(field.encode("UTF-8") for field in fields)
        return hash(md5(encoded).digest())

    def __getitem__(self, item: Union[int, slice]) -> SSAARead:
        # Index/slice the structure in step with the sequence. A read with no
        # structure gets an empty structure string.
        if self.structure:
            subStructure = self.structure[item]
        else:
            subStructure = ""
        return self.__class__(self.id, self.sequence[item], subStructure)

    def toString(
        self, format_: str = "fasta-ss", structureSuffix: str = ":structure"
    ) -> str:
        """
        Convert the read to a string.

        For the 'fasta-ss' format the result is in PDB format: two FASTA
        records, the first holding the sequence and the second the structure.

        @param format_: If 'fasta-ss', the sequence and structure are both
            returned. Any other value is passed to the C{toString} method of
            the parent class (so 'fasta' gives plain FASTA).
        @param structureSuffix: The C{str} suffix to append to the read id
            for the second FASTA record, containing the structure information.
        @return: A C{str} representing the read in the requested format.
        """
        if format_ != "fasta-ss":
            # Not our special format, let the parent class deal with it.
            return super().toString(format_=format_)

        fields = (
            self.id,
            self.sequence,
            self.id,
            structureSuffix,
            self.structure,
        )
        return ">%s\n%s\n>%s%s\n%s\n" % fields

    def toDict(self):
        """
        Describe this read as a dictionary.

        @return: A C{dict} with 'id', 'sequence', and 'structure' keys whose
            values are the corresponding attributes of self.
        """
        return {name: getattr(self, name) for name in ("id", "sequence", "structure")}

    @classmethod
    def fromDict(cls, d):
        """
        Build a new instance of this class from a dictionary of attributes.

        @param d: A C{dict} with keys/values for the attributes of a new
            instance of this class. Keys 'id', 'sequence', and 'structure'
            with C{str} values must be provided.
        @raise KeyError: If a required key is missing from C{d}.
        @return: A new instance of this class, with values taken from C{d}.
        """
        id_, sequence, structure = d["id"], d["sequence"], d["structure"]
        return cls(id_, sequence, structure)

    def newFromSites(self, sites, exclude=False):
        """
        Create a new read from self, with only certain sites.

        @param sites: A set of C{int} 0-based sites (i.e., indices) in
            the sequence (and structure) that should be kept.
        @param exclude: If C{True} the C{sites} will be excluded, not
            included.
        @return: A new instance of our class, with our id, containing only
            the selected sequence and structure characters.
        """
        # When excluding, invert the selection relative to all our indices.
        wanted = set(range(len(self))) - sites if exclude else sites

        kept = [
            pair
            for index, pair in enumerate(zip(self.sequence, self.structure))
            if index in wanted
        ]

        return self.__class__(
            self.id,
            "".join(base for base, _ in kept),
            "".join(structure for _, structure in kept),
        )


class SSAAReadWithX(SSAARead):
    """
    An C{SSAARead} whose alphabet additionally contains 'X', so that 'X'
    characters may appear in sequences.
    """

    # The amino acid letters, plus 'X'.
    ALPHABET = {*AA_LETTERS, "X"}


class TranslatedRead(AARead):
    """
    Hold information about one DNA->AA translation of a Read.

    @param originalRead: The original DNA or RNA L{Read} instance from which
        this translation was obtained.
    @param sequence: The C{str} AA translated sequence.
    @param frame: The C{int} frame, either 0, 1, or 2.
    @param reverseComplemented: A C{bool}, C{True} if the original sequence
        must be reverse complemented to obtain this AA sequence.
    @raise ValueError: If C{frame} is not 0, 1, or 2.
    """

    def __init__(self, originalRead, sequence, frame, reverseComplemented=False):
        if frame not in (0, 1, 2):
            raise ValueError("Frame must be 0, 1, or 2")
        # The new id is the original id with the frame (and 'rc' if the
        # original was reverse complemented) appended.
        newId = "%s-frame%d%s" % (
            originalRead.id,
            frame,
            "rc" if reverseComplemented else "",
        )
        AARead.__init__(self, newId, sequence)
        self.frame = frame
        self.reverseComplemented = reverseComplemented

    def __eq__(self, other):
        # Deliberately return False (not NotImplemented) for other types, so
        # a TranslatedRead never compares equal to a plain AARead.
        if not isinstance(other, TranslatedRead):
            return False
        return (
            AARead.__eq__(self, other)
            and self.frame == other.frame
            and self.reverseComplemented == other.reverseComplemented
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        """
        Calculate a hash key for a translated read.

        Defining C{__eq__} in a class without also defining C{__hash__}
        makes Python set C{__hash__} to C{None}, which would make instances
        unhashable. The hash combines the parent class's hash with the two
        additional attributes compared in C{__eq__}.

        @return: The C{int} hash key for the read.
        """
        # NOTE(review): assumes AARead provides a __hash__ that is consistent
        # with AARead.__eq__ - confirm against the AARead/Read definitions.
        return hash((AARead.__hash__(self), self.frame, self.reverseComplemented))

    def toDict(self):
        """
        Get information about this read in a dictionary.

        @return: A C{dict} with keys/values for the attributes of self. This
            is the parent class's dictionary with 'frame' and
            'reverseComplemented' keys added.
        """
        result = super().toDict()

        result.update(
            {
                "frame": self.frame,
                "reverseComplemented": self.reverseComplemented,
            }
        )

        return result

    @classmethod
    def fromDict(cls, d):
        """
        Create a new instance from attribute values provided in a dictionary.

        @param d: A C{dict} with keys/values for the attributes of a new
            instance of this class. Keys 'id' and 'sequence' with C{str} values
            must be provided. A 'quality' C{str} key is optional. Key 'frame'
            must have an C{int} value. Key 'reverseComplemented' must be a
            C{bool}, all keys are as described in the docstring for this class.
        @raise KeyError: If a required key is missing from C{d}.
        @return: A new instance of this class, with values taken from C{d}.
        """
        # Make a dummy instance whose attributes we can set explicitly. The
        # arguments are (originalRead, sequence, frame, reverseComplemented):
        # an empty sequence and frame 0 are passed so that each parameter
        # receives a value of the documented type.
        new = cls(AARead("", ""), "", 0, True)
        new.id = d["id"]
        new.sequence = d["sequence"]
        new.quality = d.get("quality")
        new.frame = d["frame"]
        new.reverseComplemented = d["reverseComplemented"]
        return new

    def maximumORFLength(self, openORFs=True):
        """
        Return the length of the longest (possibly partial) ORF in a translated
        read. The ORF may originate or terminate outside the sequence, which is
        why the length is just a lower bound.

        @param openORFs: Passed to C{self.ORFs} to select which ORFs are
            considered.
        @return: The C{int} length of the longest ORF, or 0 if C{self.ORFs}
            produces no ORFs at all.
        """
        # Without a default, max() raises ValueError on an empty iterable.
        return max((len(orf) for orf in self.ORFs(openORFs)), default=0)


class ReadFilter:
    """
    Create a function that can be used to filter a set of reads to produce a
    desired subset.

    Note: there are many additional filtering options that could be added,
    e.g., on complexity fraction, on GC %, on quality, etc.

    @param minLength: The minimum acceptable length.
    @param maxLength: The maximum acceptable length.
    @param maxNFraction: The maximum fraction of Ns that can be present in the
        sequence.
    @param removeGaps: If C{True} remove all gaps ('-' characters) from the
        read sequences.
    @param whitelist: If not C{None}, a set of exact read ids that are
        always acceptable (though other characteristics, such as length,
        of a whitelisted id may rule it out).
    @param blacklist: If not C{None}, a set of exact read ids that are
        never acceptable.
    @param whitelistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are always acceptable.
    @param blacklistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are never acceptable.
    @param titleRegex: A regex that read ids must match.
    @param negativeTitleRegex: A regex that read ids must not match.
    @param truncateTitlesAfter: A string that read ids will be truncated
        beyond. If the truncated version of an id has already been seen,
        that sequence will be skipped.
    @param sequenceWhitelist: If not C{None}, a set of C{str} values passed
        to C{SequenceFilter} as its whitelist. The filter is applied to read
        sequences (presumably in the same way that C{whitelist} is applied
        to read ids, see C{dark.filter.SequenceFilter} to confirm).
    @param sequenceBlacklist: If not C{None}, a set of C{str} values passed
        to C{SequenceFilter} as its blacklist.
    @param sequenceWhitelistFile: If not C{None}, a C{str} filename passed to
        C{SequenceFilter} as its whitelist file.
    @param sequenceBlacklistFile: If not C{None}, a C{str} filename passed to
        C{SequenceFilter} as its blacklist file.
    @param sequenceRegex: If not C{None}, a C{str} regex passed to
        C{SequenceFilter} as its positive regex.
    @param sequenceNegativeRegex: If not C{None}, a C{str} regex passed to
        C{SequenceFilter} as its negative regex.
    @param keepSequences: Either C{None} or a set of C{int}s corresponding
        to reads that should be kept. Indexing starts at zero. The numbers
        refer to the sequential number of the sequence in the list of reads.
    @param removeSequences: Either C{None} or a set of C{int}s corresponding
        to reads that should be removed. Indexing starts at zero. The numbers
        refer to the sequential number of the sequence in the list of reads.
    @param head: If not C{None}, the C{int} number of sequences at the
        start of the reads to return. Later sequences are skipped.
    @param removeDuplicates: If C{True} remove duplicated reads based only on
        sequence identity.
    @param removeDuplicatesById: If C{True} remove duplicated reads based
        only on read id.
    @param removeDuplicatesUseMD5: If C{True}, use MD5 sums instead of the
        full sequence or read when either C{removeDuplicates} or
        C{removeDuplicatesById} are C{True}.
    @param removeDescriptions: If C{True} remove the description (the part
        following the first whitespace) from read ids. The description is
        removed after applying the function specified by --idLambda (if any).
    @param modifier: If not C{None}, a function that is passed a read
        and which either returns a read or C{None}. If it returns a read,
        that read is passed through the filter. If it returns C{None},
        the read is omitted. Such a function can be used to do customized
        filtering, to change sequence ids, etc.
    @param randomSubset: If not C{None}, an C{int} giving the number of
        sequences that should be returned. These will be selected at
        random, using an algorithm that does a single pass over the data
        and which needs to know (in advance) the total number of reads.
        Because a C{Reads} instance does not always know its length, it is
        possible to specify the number of reads by passing a C{trueLength}
        argument. Note that the random selection is done before any other
        filtering. Due to this, if you want to extract a random subset of
        the reads filtered in another way, it will be best to call filter
        twice rather than doing both types of filtering in one step. E.g.,
        you very likely should do this:
            reads.filter(maxLength=100).filter(randomSubset=20)
        rather than this:
            reads.filter(maxLength=100, randomSubset=20)
        The second version will extract a random subset of 20 reads and
        only return those that have length <= 100, so your result may have
        less than 20 reads. The former version extracts reads of the
        desired length and then takes 20 reads at random from that set, so
        you'll always get 20 reads in your result, assuming there are at
        least that many reads satisfying the length filter.
    @param trueLength: The C{int} number of reads in this C{Reads} instance.
        Under normal circumstances it will not be necessary to pass this
        argument. However in some cases a subclass (e.g., with
        C{dark.fasta.FastaReads}) does not know its length until its data
        has been read from disk. In such cases, it is not possible to
        choose a random subset without keeping the subset in memory (which
        is undesirable). See
        https://en.wikipedia.org/wiki/Reservoir_sampling for one approach
        when the set size is unknown. However, it is possible to filter a
        random subset in a single pass over the data without keeping the
        set in memory if the set size is known. C{trueLength} makes it
        possible to pass the actual number of reads (this will obviously
        need to be obtained via some other mechanism).
    @param sampleFraction: If not C{None}, a [0.0, 1.0] C{float} indicating
        a fraction of the reads that should be allowed to pass through the
        filter. The sample size will only be approximately the product of
        the C{sampleFraction} and the number of reads. The sample is taken
        at random. If you try to combine this filter with C{randomSubset}
        a C{ValueError} will be raised. If you need both filters, run them
        one after another.
    @param sequenceNumbersFile: If not C{None}, gives the C{str} name of a
        file containing (1-based) sequence numbers, in ascending order,
        one per line. Only those sequences matching the given numbers will
        be kept.
    @param idLambda: If not C{None}, a C{str} Python lambda function
        specification to use to modify read ids. The function is applied
        before removing the description (if --removeDescriptions is also
        specified). The string is passed to C{eval}, so it must come from a
        trusted source.
    @param readLambda: If not C{None}, a C{str} Python lambda function
        specification to use to modify reads. The function will be passed,
        and must return, a single Read (or one of its subclasses). This
        function is called after the --idLambda function, if any. The string
        is passed to C{eval}, so it must come from a trusted source.
    @param keepSites: A set of C{int} 0-based sites (i.e., indices) in
        sequences that should be kept. If C{None} (the default), all sites are
        kept.
    @param removeSites: A set of C{int} 0-based sites (i.e., indices) in
        sequences that should be removed. If C{None} (the default), no sites
        are removed.
    @param reverse: If C{True}, reverse the sequences. Reversing happens
        at a very late stage (i.e., after sites are altered via keepSites
        and removeSites). Ignored if C{reverseComplement} is also C{True}.
    @param reverseComplement: If C{True}, replace sequences with their reverse
        complements. Reversing happens at a very late stage (i.e., after sites
        are altered via keepSites and removeSites).
    @param upper: Convert sequences to uppercase.
    @param lower: Convert sequences to lowercase (ignored if C{upper} is
        C{True}).
    @param upperId: Convert sequence IDs to uppercase.
    @param lowerId: Convert sequence IDs to lowercase (ignored if C{upperId}
        is C{True}).
    @param rotate: Rotate the sequence an C{int} number of characters (to the
        right if > 0, else to the left).
    @raises ValueError: If C{randomSubset} and C{sampleFraction} are both
        specified, or if C{randomSubset} is specified but C{trueLength} is not,
        or if C{removeDuplicatesUseMD5} is given without C{removeDuplicates}
        or C{removeDuplicatesById}, or if the sequence numbers in
        C{sequenceNumbersFile} are non-positive or not ascending, or if both
        C{keepSites} and C{removeSites} are given, or if both C{keepSequences}
        and C{removeSequences} are given.
    @raise AttributeError: If C{reverseComplement} is C{True} but the read
        type does not allow for reverse complementing.
    """

    # TODO, when/if needed: make it possible to pass a seed for the RNG
    # when randomSubset or sampleFraction are used. Also possible is to
    # save and restore the state of the RNG and/or to optionally add
    # 'seed=XXX' to the end of the id of the first read, etc.

    def __init__(
        self,
        minLength: Optional[int] = None,
        maxLength: Optional[int] = None,
        maxNFraction: Optional[float] = None,
        removeGaps: bool = False,
        whitelist: Optional[set[str]] = None,
        blacklist: Optional[set[str]] = None,
        whitelistFile: Optional[str] = None,
        blacklistFile: Optional[str] = None,
        titleRegex: Optional[str] = None,
        negativeTitleRegex: Optional[str] = None,
        truncateTitlesAfter: Optional[str] = None,
        sequenceWhitelist: Optional[set[str]] = None,
        sequenceBlacklist: Optional[set[str]] = None,
        sequenceWhitelistFile: Optional[str] = None,
        sequenceBlacklistFile: Optional[str] = None,
        sequenceRegex: Optional[str] = None,
        sequenceNegativeRegex: Optional[str] = None,
        keepSequences: Optional[set[int]] = None,
        removeSequences: Optional[set[int]] = None,
        head: Optional[int] = None,
        removeDuplicates: bool = False,
        removeDuplicatesById: bool = False,
        removeDuplicatesUseMD5: bool = False,
        removeDescriptions: bool = False,
        modifier: Optional[Callable[[Read], Union[Read, None]]] = None,
        randomSubset: Optional[int] = None,
        trueLength: Optional[int] = None,
        sampleFraction: Optional[float] = None,
        sequenceNumbersFile: Optional[str] = None,
        idLambda: Optional[str] = None,
        readLambda: Optional[str] = None,
        keepSites: Optional[set[int]] = None,
        removeSites: Optional[set[int]] = None,
        reverse: bool = False,
        reverseComplement: bool = False,
        upper: bool = False,
        lower: bool = False,
        upperId: bool = False,
        lowerId: bool = False,
        rotate: Optional[int] = None,
    ):
        if randomSubset is not None:
            if sampleFraction is not None:
                raise ValueError(
                    "randomSubset and sampleFraction cannot be "
                    "used simultaneously in a filter. Make two "
                    "read filters instead."
                )

            if trueLength is None:
                raise ValueError(
                    "trueLength must be supplied if randomSubset is specified."
                )

        self.minLength = minLength
        self.maxLength = maxLength
        self.maxNFraction = maxNFraction
        self.removeGaps = removeGaps
        self.head = head
        self.removeDescriptions = removeDescriptions
        self.modifier = modifier
        self.randomSubset = randomSubset
        self.trueLength = trueLength

        if removeDuplicatesUseMD5 and not (removeDuplicates or removeDuplicatesById):
            raise ValueError(
                "If you specify removeDuplicatesUseMD5, you need to also use "
                "one of removeDuplicates or removeDuplicatesById."
            )
        self.removeDuplicates = removeDuplicates
        self.removeDuplicatesById = removeDuplicatesById
        self.removeDuplicatesUseMD5 = removeDuplicatesUseMD5

        if keepSequences and removeSequences:
            raise ValueError(
                "Cannot simultaneously filter using keepSequences and "
                "removeSequences. Call filter twice in succession instead."
            )
        self.keepSequences = keepSequences
        self.removeSequences = removeSequences

        if keepSites and removeSites:
            raise ValueError(
                "Cannot simultaneously filter using keepSites and "
                "removeSites. Call filter twice in succession instead."
            )
        self.keepSites = keepSites
        self.removeSites = removeSites

        if reverseComplement:
            # Make sure reverse is not also set.
            reverse = False
        self.reverse = reverse
        self.reverseComplement = reverseComplement

        self.upper = upper
        self.lower = lower
        self.upperId = upperId
        self.lowerId = lowerId

        self.rotate = rotate

        # Per-instance filtering state. alwaysFalse is set once it is known
        # that no further read can pass, yieldCount is the number of reads
        # that have passed the filter, and readIndex is the 0-based index of
        # the read most recently given to self.filter.
        self.alwaysFalse = False
        self.yieldCount = 0
        self.readIndex = -1

        self.titleFilter: Optional[TitleFilter] = None
        self.sequenceFilter: Optional[SequenceFilter] = None

        def _wantedSequences(filename):
            """
            Read and yield integer sequence numbers from a file.

            @raise ValueError: If the sequence numbers are not all positive or
                are not ascending.
            @return: A generator that yields C{int} sequence numbers.
            """
            with open(filename) as fp:
                lastNumber = None
                for line in fp:
                    n = int(line)
                    if lastNumber is None:
                        if n < 1:
                            raise ValueError(
                                "First line of sequence number file %r must "
                                "be at least 1." % filename
                            )
                        lastNumber = n
                        yield n
                    else:
                        if n > lastNumber:
                            lastNumber = n
                            yield n
                        else:
                            raise ValueError(
                                "Line number file %r contains non-ascending "
                                "numbers %d and %d." % (filename, lastNumber, n)
                            )

        self.wantedSequenceNumberGeneratorExhausted = False
        self.nextWantedSequenceNumber = None

        if sequenceNumbersFile is not None:
            self.wantedSequenceNumberGenerator = _wantedSequences(sequenceNumbersFile)
            try:
                self.nextWantedSequenceNumber = next(self.wantedSequenceNumberGenerator)
            except StopIteration:
                # There was a sequence number file, but it was empty. So no
                # reads will ever be accepted.
                self.alwaysFalse = True

        if (
            whitelist
            or blacklist
            or whitelistFile
            or blacklistFile
            or titleRegex
            or negativeTitleRegex
            or truncateTitlesAfter
        ):
            self.titleFilter = TitleFilter(
                whitelist=whitelist,
                blacklist=blacklist,
                whitelistFile=whitelistFile,
                blacklistFile=blacklistFile,
                positiveRegex=titleRegex,
                negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter,
            )

        if (
            sequenceWhitelist
            or sequenceBlacklist
            or sequenceWhitelistFile
            or sequenceBlacklistFile
            or sequenceRegex
            or sequenceNegativeRegex
        ):
            self.sequenceFilter = SequenceFilter(
                whitelist=sequenceWhitelist,
                blacklist=sequenceBlacklist,
                whitelistFile=sequenceWhitelistFile,
                blacklistFile=sequenceBlacklistFile,
                positiveRegex=sequenceRegex,
                negativeRegex=sequenceNegativeRegex,
            )

        # The 'seen' sets hold either the original strings or (when
        # removeDuplicatesUseMD5 is True) their MD5 digests, which are bytes.
        if removeDuplicates:
            self.sequencesSeen: set[Union[str, bytes]] = set()

        if removeDuplicatesById:
            self.idsSeen: set[Union[str, bytes]] = set()

        if sampleFraction is not None:
            if sampleFraction == 0.0:
                # The filter method should always return False.
                self.alwaysFalse = True
            elif sampleFraction == 1.0:
                # Passing 1.0 can be treated the same as passing no value.
                # This makes the filter code below simpler.
                sampleFraction = None
        self.sampleFraction = sampleFraction

        # SECURITY: the lambda specifications are evaluated with eval, which
        # will execute arbitrary Python code. Only pass strings that come
        # from a trusted source (e.g., the user's own command line).
        self.idLambda = eval(idLambda) if idLambda else None
        self.readLambda = eval(readLambda) if readLambda else None

    def filter(self, read: Read) -> Union[Literal[False], Read]:
        """
        Check if a read passes the filter.

        Note that this method keeps state between calls (the index of the
        current read, the number of reads that have passed, duplicates seen,
        etc.), so reads must be passed in their original order. Some options
        (C{rotate}, C{upper}, C{lower}, C{upperId}, C{lowerId}, C{idLambda},
        C{removeDescriptions}) modify the passed read in place.

        @param read: A C{Read} instance.
        @return: C{read} (or a modified or new read made from it) if C{read}
            passes the filter, C{False} if not.
        """
        self.readIndex += 1

        if self.alwaysFalse:
            return False

        if self.wantedSequenceNumberGeneratorExhausted:
            return False

        if self.nextWantedSequenceNumber is not None:
            # Sequence numbers in the file are 1-based, readIndex is 0-based.
            if self.readIndex + 1 == self.nextWantedSequenceNumber:
                # We want this sequence.
                try:
                    self.nextWantedSequenceNumber = next(
                        self.wantedSequenceNumberGenerator
                    )
                except StopIteration:
                    # The sequence number iterator ran out of sequence
                    # numbers.  We must let the rest of the filtering
                    # continue for the current sequence in case we
                    # throw it out for other reasons (as we might have
                    # done for any of the earlier wanted sequence
                    # numbers).
                    self.wantedSequenceNumberGeneratorExhausted = True
            else:
                # This sequence isn't one of the ones that's wanted.
                return False

        if self.sampleFraction is not None and uniform(0.0, 1.0) > self.sampleFraction:
            # Note that we don't have to worry about the 0.0 or 1.0
            # cases in the above 'if', as they have been dealt with
            # in self.__init__.
            return False

        if self.randomSubset is not None:
            if self.yieldCount == self.randomSubset:
                # The random subset has already been fully returned.
                # There's no point in going any further through the input.
                self.alwaysFalse = True
                return False
            elif uniform(0.0, 1.0) > (
                # The chance of taking this read is the number of reads we
                # still need divided by the number of reads still to come.
                (self.randomSubset - self.yieldCount)
                / (self.trueLength - self.readIndex)
            ):
                return False

        # Use >= (not ==) here. One of the tests above may have returned
        # False for the read whose index equals self.head, in which case an
        # equality test would never succeed and reads beyond the first
        # self.head reads would be allowed through.
        if self.head is not None and self.readIndex >= self.head:
            # We're completely done.
            self.alwaysFalse = True
            return False

        if (self.minLength is not None and len(read) < self.minLength) or (
            self.maxLength is not None and len(read) > self.maxLength
        ):
            return False

        if self.maxNFraction is not None:
            readLength = len(read)
            # An empty read contains no Ns, so it cannot exceed the limit
            # (and we must not divide by its zero length).
            if readLength and read.sequence.count("N") / readLength > self.maxNFraction:
                return False

        if self.titleFilter and self.titleFilter.accept(read.id) == TitleFilter.REJECT:
            return False

        if (
            self.sequenceFilter
            and self.sequenceFilter.accept(read.sequence) == SequenceFilter.REJECT
        ):
            return False

        if self.keepSequences is not None and self.readIndex not in self.keepSequences:
            return False

        if self.removeSequences is not None and self.readIndex in self.removeSequences:
            return False

        if self.removeDuplicates:
            if self.removeDuplicatesUseMD5:
                sequence = md5(read.sequence.encode("UTF-8")).digest()
            else:
                sequence = read.sequence
            if sequence in self.sequencesSeen:
                return False
            self.sequencesSeen.add(sequence)

        if self.removeDuplicatesById:
            if self.removeDuplicatesUseMD5:
                id_ = md5(read.id.encode("UTF-8")).digest()
            else:
                id_ = read.id
            if id_ in self.idsSeen:
                return False
            self.idsSeen.add(id_)

        # Only from here on do we start possibly modifying the read.

        if self.rotate:  # I.e., not None and not zero.
            read.rotate(self.rotate)

        if self.upper:
            read.sequence = read.sequence.upper()

        elif self.lower:
            read.sequence = read.sequence.lower()

        if self.upperId:
            read.id = read.id.upper()

        elif self.lowerId:
            read.id = read.id.lower()

        if self.removeGaps:
            if read.quality is None:
                read = read.__class__(read.id, read.sequence.replace("-", ""))
            else:
                # Drop the quality character that goes with each gap.
                newSequence = []
                newQuality = []
                for base, quality in zip(read.sequence, read.quality):
                    if base != "-":
                        newSequence.append(base)
                        newQuality.append(quality)
                read = read.__class__(
                    read.id, "".join(newSequence), "".join(newQuality)
                )

        if self.modifier:
            modified = self.modifier(read)
            if modified is None:
                return False
            else:
                read = modified

        # We have to use 'is not None' in the following tests so the empty set
        # is processed properly.
        if self.keepSites is not None:
            read = read.newFromSites(self.keepSites)
        elif self.removeSites is not None:
            read = read.newFromSites(self.removeSites, exclude=True)

        if self.idLambda:
            newId = self.idLambda(read.id)
            if newId is None:
                return False
            else:
                read.id = newId

        if self.readLambda:
            newRead = self.readLambda(read)
            if newRead is None:
                return False
            else:
                read = newRead

        if self.removeDescriptions:
            # An empty (or all-whitespace) id has no fields. Leave such an
            # id untouched rather than failing with an IndexError.
            idFields = read.id.split(None, 1)
            if idFields:
                read.id = idFields[0]

        if self.reverse:
            read = read.reverse()
        elif self.reverseComplement:
            read = read.reverseComplement()

        self.yieldCount += 1
        return read


# Provide a mapping from all read class names to read classes. This can be
# useful in deserialization (e.g., to recover the class to instantiate when
# only its C{str} name is available). Each key is spelled exactly like the
# name the corresponding class is bound to in this module.
readClassNameToClass: dict[str, type[Read]] = {
    "AARead": AARead,
    "AAReadORF": AAReadORF,
    "AAReadWithX": AAReadWithX,
    "DNARead": DNARead,
    "RNARead": RNARead,
    "Read": Read,
    "SSAARead": SSAARead,
    "SSAAReadWithX": SSAAReadWithX,
    "TranslatedRead": TranslatedRead,
}


def getUnambiguousBases() -> dict[str, set[str]]:
    """
    Get the unambiguous sequence letters for each read class.

    @return: A C{dict} that maps C{str} read class names to the C{set} of
        single-character C{str} codes that are (loosely speaking) the
        unambiguous sequence bases for that class. Read classes with the same
        alphabet share a single set object. New sets are made on each call.
    """
    dna = set("ACGT")
    rna = set("ACGU")
    aa = set(AA_LETTERS)

    # (read class name, alphabet) pairs, in the order in which the keys
    # appear in the result.
    pairs = (
        ("AARead", aa),
        ("AAReadORF", aa),
        ("AAReadWithX", aa),
        ("DNARead", dna),
        ("RNARead", rna),
        ("Read", dna),
        ("SSAARead", aa),
        ("SSAAReadWithX", aa),
        ("TranslatedRead", aa),
    )

    return dict(pairs)


class Reads:
    """
    Maintain a collection of sequence reads.

    @param initialReads: If not C{None}, an iterable of C{Read} (or C{Read}
        subclass) instances.
    """

    def __init__(self, initialReads: Optional[Iterable[Read]] = None) -> None:
        self._initialReads = initialReads
        self._additionalReads: List[Read] = []
        self._filters: List[Callable] = []
        self._iterated = False

    def filterRead(self, read: Read) -> Union[Literal[False], Read]:
        """
        Filter a read, according to our set of filters.

        @param read: A C{Read} instance or one of its subclasses.
        @return: C{False} if the read fails any of our filters, else the
            C{Read} instance returned by our list of filters.
        """
        for filterFunc in self._filters:
            filteredRead = filterFunc(read)
            if filteredRead is False:
                return False
            else:
                read = filteredRead
        return read

    def add(self, read: Read) -> None:
        """
        Add a read to this collection of reads.

        @param read: A C{Read} instance.
        """
        self._additionalReads.append(read)

    def __iter__(self) -> Generator[Read, None, None]:
        """
        Iterate through all the reads.

        Reads are produced from three sources, in this order: reads added via
        C{add}, the initial reads passed to C{__init__}, and the reads
        provided by C{self.iter()}. Every read is passed through
        C{self.filterRead} and is only yielded if the result is not C{False}.

        As a side effect, once the generator has run to completion the
        unfiltered read count is stored in C{self._unfilteredLength} and
        C{self._iterated} is set to C{True}. Neither is updated if the
        iteration is abandoned part-way through.

        @return: A generator that yields reads. The returned read types depend
            on the kind of reads that were added to this instance.
        """

        # 1) Reads that were individually added.
        for read in self._additionalReads:
            filteredRead = self.filterRead(read)
            if filteredRead is not False:
                yield filteredRead

        # The additional reads are held in a list, so their (unfiltered) count
        # is simply its length.
        _unfilteredLength = len(self._additionalReads)

        # self._initialReads may be a Reads instance and/or may not support
        # len().
        # A None (or otherwise false) value is treated as there being no
        # initial reads.
        initialReads = self._initialReads or []
        initialReadsLength = 0

        # 2) The initial reads. They are counted here as we go because the
        # iterable might not support len().
        for read in initialReads:
            initialReadsLength += 1
            filteredRead = self.filterRead(read)
            if filteredRead is not False:
                yield filteredRead

        # A Reads instance reports its own unfiltered length (it has just been
        # fully iterated by the loop above); for anything else we use the
        # count made above.
        if isinstance(initialReads, Reads):
            _unfilteredLength += initialReads.unfilteredLength()
        else:
            _unfilteredLength += initialReadsLength

        # The value returned by self.iter() may be a Reads instance and/or
        # may not support len().
        # 3) Reads provided by self.iter(), counted in the same way as the
        # initial reads.
        subclassReads: Iterable[Read] = self.iter()
        subclassReadsLength = 0
        for xread in subclassReads:
            subclassReadsLength += 1
            filteredRead = self.filterRead(xread)
            if filteredRead is not False:
                yield filteredRead

        if isinstance(subclassReads, Reads):
            _unfilteredLength += subclassReads.unfilteredLength()
        else:
            _unfilteredLength += subclassReadsLength

        # Only now (after all sources are exhausted) is the total known, so
        # this is the point at which unfilteredLength() becomes usable.
        self._unfilteredLength = _unfilteredLength
        self._iterated = True

    def unfilteredLength(self) -> int:
        """
        Return the underlying number of reads in C{self}, irrespective of any
        filtering that has been applied.

        To obtain the number of reads in a filtered C{Reads} instance, you
        must count the reads yourself as you iterate it.

        @raises RuntimeError: If C{self} has not been fully iterated.
        @return: The C{int} number of reads in C{self}.
        """
        if self._iterated:
            return self._unfilteredLength
        else:
            raise RuntimeError(
                "The unfiltered length of a Reads instance is unknown until "
                "it has been iterated."
            )

    def iter(self) -> Union[Generator[Read, None, None], list]:
        """
        Placeholder to allow subclasses to provide reads.

        These might be extracted from a file. E.g., the
        C{dark.reads.fasta.FastaReads} class (a subclass of C{Reads})
        overrides this method to provide reads from a file.

        @return: An iterable of C{Read} instances.
        """
        return []

    def save(
        self, filename: Union[str, TextIO], format_: str = "fasta", mode: str = "w"
    ) -> int:
        """
        Write the reads to C{filename} in the requested format.

        @param filename: Either a C{str} file name to save into (the file will
            be overwritten) or an open file descriptor (e.g., sys.stdout).
        @param format_: A C{str} format to save as, either 'fasta', 'fastq' or
            'fasta-ss'.
        @param mode: The mode to open the file with (if C{filename}) is not already
            an open file.
        @raise ValueError: if C{format_} is 'fastq' and a read with no quality
            is present, or if an unknown format is requested.
        @return: An C{int} giving the number of reads in C{self}.
        """
        format_ = format_.lower()
        count = 0

        if isinstance(filename, (str, Path)):
            try:
                with open(filename, mode) as fp:
                    for read in iter(self):
                        fp.write(read.toString(format_))
                        count += 1
            except ValueError:
                os.unlink(filename)
                raise
        else:
            # We have a file-like object.
            for read in iter(self):
                filename.write(read.toString(format_))
                count += 1
        return count

    def filter(self, **kwargs) -> Reads:
        """
        Append a new filter to this C{Reads} instance.

        A C{ReadFilter} is built from the keyword arguments and its C{filter}
        method is added after any filters that are already present.

        @param kwargs: Keyword arguments, as accepted by C{ReadFilter}.
        @return: C{self}, so calls can be chained.
        """
        self._filters.append(ReadFilter(**kwargs).filter)
        return self

    def clearFilters(self) -> Reads:
        """
        Clear all filters on this C{Reads} instance.

        @return: C{self}.
        """
        self._filters = []
        return self

    def summarizePosition(self, index: int) -> dict:
        """
        Compute residue counts at a specific sequence index.

        @param index: an C{int} index into the sequence.
        @return: A C{dict} with the count of the reads, the too-short (excluded)
            sequences, and a Counter instance giving the base/residue counts.
        """
        countAtPosition: Counter = Counter()
        excludedCount = readCount = 0

        for read in iter(self):
            readCount += 1
            try:
                countAtPosition[read.sequence[index]] += 1
            except IndexError:
                excludedCount += 1

        return {
            "excludedCount": excludedCount,
            "countAtPosition": countAtPosition,
            "readCount": readCount,
        }

    def sitesMatching(self, targets: set, matchCase: bool, any_: bool) -> set[int]:
        """
        Find sites (i.e., sequence indices) that match a given set of target
        sequence bases.

        @param targets: A C{set} of sequence bases to look for.
        @param matchCase: If C{True}, case will be considered in matching.
        @param any_: If C{True}, return sites that match in any read. Else
            return sites that match in all reads.
        @return: A C{set} of 0-based C{int} sites that indicate where the target
            bases occur in our reads. An index will be in this set if any of
            our reads has any of the target bases in that location.
        """
        # If case is unimportant, we convert everything (target bases and
        # sequences, as we read them) to lower case.
        if not matchCase:
            targets = set(map(str.lower, targets))

        # result = set() if any_ else None
        if any_:
            result: set[int] = set()

        # I am using a 'first' variable here instead of just setting
        # 'result' to be None and testing that below, because if I let it
        # be None then mypy complains it is None when I try to add to it.
        first = True

        for read in iter(self):
            sequence = read.sequence if matchCase else read.sequence.lower()
            matches = set(
                index for (index, base) in enumerate(sequence) if base in targets
            )
            if any_:
                result |= matches
            else:
                if first:
                    first = False
                    result = matches
                else:
                    result &= matches
                # We can exit early if we run out of possible sites.
                if not result:
                    break

        # Make sure we don't return None.
        # return result or set()
        return set() if result is None else result

    def variableSites(
        self,
        confirm: bool = False,
        homogeneityLevel: float = 1.0,
        unknownAreAmbiguous: bool = False,
    ) -> dict:
        """
        Find the variable sites in a set of reads.

        @param confirm: If C{True} only return sites where there is confirm
            variation (i.e., ambiguous sites that are compatible with there
            being no variation are not returned).
        @homogeneityLevel: If the frequency of the most-common nucleotide at
            a site is at least this value, the site will be considered
            homogeneous.
        @param unknownAreAmbiguous: If C{True}, any unknown character (e.g., a
            '-' gap or '?' unknown base) will be treated as being fully
            ambiguous (i.e., could be any of ACGT). Otherwise, all unknown
            characters are collected under the count for '-'.
        @return: A C{dict} keyed by C{int} site number (0-based) with values
            that are C{FloatBaseCounts} instances giving the base counts at
            the site.
        """
        reads = list(self)
        varSites = {}

        if reads:
            length = len(reads[0].sequence)
            if not all(len(read.sequence) == length for read in reads):
                raise ReadLengthsNotIdenticalError()
            for site in range(length):
                counts = FloatBaseCounts(
                    [r.sequence[site] for r in reads],
                    unknownAreAmbiguous=unknownAreAmbiguous,
                )
                if counts.variable(confirm) and not counts.homogeneous(
                    homogeneityLevel
                ):
                    varSites[site] = counts

        return varSites

    def combineReads(self) -> str:
        """
        Combine all reads into a single read. Reads must be of equal length.
        This is "combine" in the sense of making a single consensus sequence
        but without considering the frequency of the bases at each site.

        @return: a C{str} sequence made from combining all reads.
        """
        reads = list(self)
        assert len({len(read) for read in reads}) == 1

        sequence = ""
        for site in range(len(reads[0])):
            bases = set([r.sequence[site] for r in reads])
            if len(bases) == 1:
                sequence += bases.pop()
            elif (
                len(bases) == 2
                and "N" in bases
                and bases.intersection({"A", "T", "G", "C"})
            ):
                sequence += list(bases.intersection({"A", "T", "G", "C"}))[0]
            else:
                nucleotides = set()
                for base in bases:
                    nucleotides.update(AMBIGUOUS.get(base, set()))
                try:
                    sequence += BASES_TO_AMBIGUOUS["".join(sorted(nucleotides))]
                except KeyError:
                    raise ValueError(
                        "Unknown DNA base(s): %r" % (nucleotides - set("ACGTN-"))
                    )

        return sequence

    def temporalBaseCounts(
        self,
        firstPostId: str,
        minFrequency: Optional[float] = None,
        maxFrequency: Optional[float] = None,
        minCount: int = 0,
        preIds: Optional[Set[str]] = None,
    ):
        """
        Iterate through time-sorted reads, accumulating counts of bases at each
        offset pre- and post- a specific sequence.

        @param firstPostId: The C{str} id of the first member of the 'post'
            sequences.
        @param minFrequency: The C{float} minimum frequency at which a new base
            is considered interesting and should have its frequency returned.
        @param maxFrequency: The C{float} maximum frequency at which a new base
            is considered interesting and should have its frequency returned.
        @param minCount: The C{int} minimal number of times a new base must be
            seen to be considered interesting (and to therefore have its
            frequency returned).
        @param preIds: If not C{None}, a C{set} of C{str} ids to include in the
            the early counting (i.e., before seeing firstPostId). If C{None},
            the sequences of all early ids will be included.
        @return: A C{dict}, as below.
        """
        first = True
        preBases: dict[int, dict[str, int]] = {}
        postBases: dict[int, dict[str, int]] = {}
        preCount = postCount = 0
        reference = None
        postIdFound = False
        preIdsFound = set()
        _DNA = set("ACGT")

        for genome in iter(self):
            if first:
                first = False
                length = len(genome)
                reference = genome
                for offset in range(length):
                    preBases[offset] = defaultdict(int)
                    postBases[offset] = defaultdict(int)
            else:
                if len(genome) != length:
                    raise ValueError(
                        f"Genome {genome.id!r} has length {len(genome)} which "
                        f"does not match the length of the first input "
                        f"sequence ({length})."
                    )

            if genome.id == firstPostId:
                if postIdFound:
                    raise ValueError(
                        f"Delimiting sequence id {firstPostId!r} found more than once!"
                    )
                postIdFound = True

            if postIdFound:
                bases = postBases
                postCount += 1
            else:
                if preIds:
                    if genome.id not in preIds:
                        continue
                    else:
                        if genome.id in preIdsFound:
                            raise ValueError(
                                f"Pre-id sequence {genome.id!r} found more than once!"
                            )
                        preIdsFound.add(genome.id)

                bases = preBases
                preCount += 1

            for offset, base in enumerate(genome.sequence):
                # TODO: Deal with ambiguous codes instead of ignoring them.
                if base in _DNA:
                    bases[offset][base] += 1

        if reference is None:
            raise ValueError("No genomes found.")

        if not postIdFound:
            raise ValueError(
                f"The delimiting sequence id {firstPostId!r} was not found."
            )

        if preIds and preIds != preIdsFound:
            missing = sorted(preIds - preIdsFound)
            if len(missing) == 1:
                raise ValueError(f"Pre-id {missing[0]!r} not found.")
            else:
                raise ValueError(
                    f"{len(missing)} pre-ids ({', '.join(missing)}) was not found."
                )

        # Look for bases (that occurred in the post-sequences) that are new
        # (i.e., previously unseen). Calculate their frequencies, and
        # record frequencies that are in the wanted range.
        newFrequencies: dict[int, dict[str, float]] = defaultdict(dict)
        for offset in range(length):
            newBasesInPost = set(postBases[offset]) - set(preBases[offset])
            if newBasesInPost:
                for newBase in newBasesInPost:
                    baseCount = postBases[offset][newBase]
                    if baseCount > minCount:
                        frq = baseCount / postCount
                        if (minFrequency is None or frq >= minFrequency) and (
                            maxFrequency is None or frq <= maxFrequency
                        ):
                            newFrequencies[offset][newBase] = frq

        return {
            "reference": reference,
            "pre": {
                "bases": preBases,
                "count": preCount,
            },
            "post": {
                "bases": postBases,
                "count": postCount,
                "new": newFrequencies,
            },
        }

    def getPrefixAndSuffixOffsetsForId(
        self,
        id_: str | None,
        prefix: str | None,
        suffix: str | None,
        ignoreGaps: bool = False,
        gapCharacter: str = "-",
    ) -> tuple[tuple[int, int], Read]:
        """
        Find the prefix (start) and suffix (end) in a read with id 'id_'.

        Return a 2-tuple containing 1) a 2-tuple with (start, end) offsets of the prefix
        and suffix start and end, respectively, and 2) the read in question.
        """

        for read in self:
            if read.id == id_:
                return (
                    read.getPrefixAndSuffixOffsets(
                        prefix, suffix, ignoreGaps=ignoreGaps, gapCharacter=gapCharacter
                    ),
                    read,
                )

        raise ValueError(f"No sequence with id {id_!r} found.")

    def getPrefixAndSuffixOffsets(
        self,
        prefix: str | None,
        suffix: str | None,
        ignoreGaps: bool = False,
        gapCharacter: str = "-",
    ) -> tuple[tuple[int, int], list[tuple[tuple[int, int], Read]]]:
        """
        Find a prefix (start) and/or a suffix (end) in a collection of reads.

        Return a tuple with the start-of-prefix and end-of-suffix offsets, and a
        list containing ((start, end), Read) tuples with the offsets in each read.
        The list in the results is for testing and for our caller in case they want
        to print a summary of match information.
        """
        offsetInfo = [
            (
                read.getPrefixAndSuffixOffsets(
                    prefix, suffix, ignoreGaps=ignoreGaps, gapCharacter=gapCharacter
                ),
                read,
            )
            for read in self
        ]

        if not offsetInfo:
            raise ValueError("No input sequences were given.")

        start = end = -1
        startRead = endRead = None

        for (thisStart, thisEnd), read in offsetInfo:
            # Remember the first read where we managed to find the prefix and suffix
            # so we can produce a helpful error message if we later find either at a
            # different offset.
            if thisStart != -1 and startRead is None:
                startRead = read
            if thisEnd != -1 and endRead is None:
                endRead = read

            if start == -1:
                # We haven't yet found a sequence that has the prefix.
                start = thisStart
            else:
                if thisStart != -1 and thisStart != start:
                    # If start != -1 there must have been a prefix and a start read.
                    assert prefix
                    assert startRead
                    raise ValueError(
                        f"Conflict: prefix {prefix!r} was found at offset {start} in "
                        f"{startRead.id!r} but at offset {thisStart} in {read.id!r}."
                    )

            if end == -1:
                # We haven't yet found a sequence that has the suffix.
                end = thisEnd
            else:
                if thisEnd != -1 and thisEnd != end:
                    # If end != -1 there must have been a suffix and a end read.
                    assert suffix
                    assert endRead
                    raise ValueError(
                        f"Conflict: suffix {suffix!r} was found ending at offset "
                        f"{end} in {endRead.id!r} but ending at offset {thisEnd} in "
                        f"{read.id!r}."
                    )

        return (start, end), offsetInfo

    def extractRegion(
        self,
        id_: str | None,
        prefix: str | None,
        suffix: str | None,
        ignoreGaps: bool = False,
        gapCharacter: str = "-",
        allowUnequalLengths: bool = False,
    ) -> tuple[ReadsInRAM, tuple[int, int], list[tuple[tuple[int, int], Read]]]:
        """
        Extract a region from all the (usually aligned) reads in self.

        Returns a 3-tuple containing:

          1) A ReadsInRAM instance, containing all the sequences in self trimmed
             according to the location of the start of the prefix and end of the
             suffix. If no prefix is given, the sequences will start from their
             beginning, and if no suffix is given they will end at their end.
          2) A 2-tuple with the int start offset of the prefix (or -1 if no prefix
             is given) and the offset of the character after the end of the
             suffix (or -1 if no suffix is given).
          3) A list of details of the offsets of the start-of-prefix and end-of-suffix
             matches (or -1 if there was no match). The list elements are 3-tuples,
             with (start, end, read). If an ID is passed in id_, the list will
             only contain one element, for that read. If no ID is passed, the list
             will have prefix/suffix offsets for all reads.
        """

        if not (prefix or suffix):
            raise ValueError("Neither a prefix nor a suffix was specified.")

        if not allowUnequalLengths:
            lengths = set(len(read) for read in self)
            if len(lengths) != 1:
                raise ReadLengthsNotIdenticalError(
                    "All sequences must be the same length, unless "
                    "allowUnequalLengths is true. Found lengths "
                    + ", ".join(map(str, sorted(lengths)))
                    + "."
                )

        if id_:
            (start, end), read = self.getPrefixAndSuffixOffsetsForId(
                id_, prefix, suffix, ignoreGaps=ignoreGaps, gapCharacter=gapCharacter
            )
            offsetInfo = [((start, end), read)]
        else:
            (start, end), offsetInfo = self.getPrefixAndSuffixOffsets(
                prefix, suffix, ignoreGaps=ignoreGaps, gapCharacter=gapCharacter
            )

        if start == -1:
            if end == -1:
                if prefix:
                    if suffix:
                        what = "prefix and suffix were"
                    else:
                        what = "prefix was"
                else:
                    assert suffix  # This is tested above.
                    what = "suffix was"
                raise ValueError(f"The {what} not matched by any sequence.")
            else:
                result = ReadsInRAM(read[:end] for read in self)
        elif end == -1:
            result = ReadsInRAM(read[start:] for read in self)
        else:
            # This assert is kind-of obvious, but maybe makes the code more readable.
            assert start != -1 and end != -1
            result = ReadsInRAM(read[start:end] for read in self)

        return result, (start, end), offsetInfo


class ReadsInRAM(Reads):
    """
    Maintain a collection of sequence reads in RAM.

    All initial reads are read into memory when an instance is created, and
    iteration, C{len}, and indexing operate directly on that in-memory list.
    Note that iteration yields the stored reads as they are, without passing
    them through C{filterRead}.

    @param initialReads: If not C{None}, an iterable of C{Read} (or a C{Read}
        subclass) instances.
    """

    # This class provides some C{list} like methods (len and indexing) but
    # is not an actual list or list subclass. That's because we want to inherit
    # the methods of C{Reads}, and I considered it too messy to use double
    # inheritance. If you want a real list, you can just call C{list} on a
    # C{Reads} or C{ReadsInRAM} instance.

    def __init__(self, initialReads: Optional[Iterable[Read]] = None):
        super().__init__(initialReads)

        # Read all initial reads into memory.
        if initialReads:
            for read in initialReads:
                self.add(read)

        # Our reads are always all in memory, so there is no need to wait for
        # a full iteration before the unfiltered length is known (see
        # unfilteredLength below and in Reads).
        self._iterated = True

    def unfilteredLength(self) -> int:
        """
        Return the number of reads held in memory.

        The base class only knows its unfiltered length after it has been
        iterated (that is when it sets C{self._unfilteredLength}). Because
        our C{__iter__} never sets that attribute, relying on the inherited
        method would raise C{AttributeError}, so the length of the in-memory
        list is returned instead. This also stays correct if reads are added
        later.

        @return: The C{int} number of reads in C{self}.
        """
        return len(self._additionalReads)

    def __len__(self) -> int:
        return len(self._additionalReads)

    def __getitem__(self, item: SupportsIndex) -> Union[Read, Reads]:
        # Indexing is delegated to the underlying list (so passing a slice
        # results in a plain list of reads).
        return self._additionalReads[item]

    def __setitem__(self, item: SupportsIndex, value: Read) -> None:
        self._additionalReads[item] = value

    def __iter__(self) -> Generator[Read, None, None]:
        yield from self._additionalReads


def addFASTACommandLineOptions(parser: argparse.ArgumentParser) -> None:
    """
    Add the standard FASTA-input command-line options to an argparse parser.

    The options are --fastaFile, --readClass, and a mutually exclusive group
    made up of --fasta, --fastq, and --fasta-ss.

    @param parser: An C{argparse.ArgumentParser} instance.
    """
    parser.add_argument(
        "--fastaFile",
        metavar="FILENAME",
        default=sys.stdin,
        # The given file name is opened (for reading) by argparse.
        type=open,
        help=(
            "The name of the FASTA input file. Standard input will be read "
            "if no file name is given."
        ),
    )

    readClassHelp = (
        "If specified, give the type of the reads in the input. "
        "Possible choices: %s." % ", ".join(readClassNameToClass)
    )
    parser.add_argument(
        "--readClass",
        metavar="CLASSNAME",
        default="DNARead",
        choices=readClassNameToClass,
        help=readClassHelp,
    )

    # The input format options, as (flag, extra keyword arguments, help text)
    # tuples. At most one of these may be given on a command line.
    formatOptions = (
        (
            "--fasta",
            {},
            "If specified, input will be treated as FASTA. This is the default.",
        ),
        (
            "--fastq",
            {},
            "If specified, input will be treated as FASTQ.",
        ),
        (
            "--fasta-ss",
            {"dest": "fasta_ss"},
            (
                "If specified, input will be treated as PDB FASTA "
                "(i.e., regular FASTA with each sequence followed by its "
                "structure)."
            ),
        ),
    )

    formatGroup = parser.add_mutually_exclusive_group()
    for flag, extra, helpText in formatOptions:
        formatGroup.add_argument(flag, action="store_true", help=helpText, **extra)


def parseFASTACommandLineOptions(args: argparse.Namespace) -> Reads:
    """
    Make a C{Reads} instance according to parsed command-line options.

    If no input format was given, C{args.fasta} is set to C{True} (i.e.,
    C{args} is modified).

    @param args: An argparse namespace, as returned by the argparse
        C{parse_args} function.
    @return: A C{Reads} subclass instance, depending on the type of FASTA file
        given.
    """
    # FASTA is the default input format.
    if not (args.fasta or args.fastq or args.fasta_ss):
        args.fasta = True

    readClass = readClassNameToClass[args.readClass]

    # Only the module for the requested format is imported.
    if args.fasta:
        from dark.fasta import FastaReads as readsClass
    elif args.fastq:
        from dark.fastq import FastqReads as readsClass
    else:
        from dark.fasta_ss import SSFastaReads as readsClass

    return readsClass(args.fastaFile, readClass=readClass)


def getNoCoverageCounts(
    reads: Iterable[Read], noCoverageChars: Optional[str]
) -> dict[str, int]:
    """
    Count the no-coverage characters in each read.

    @param reads: A C{Reads} instance.
    @param noCoverageChars: A C{str} of sequence characters that indicate
        no coverage. If empty or C{None}, it is assumed there are no no-coverage
        characters, so the count for all reads will be zero.
    @return: A C{dict} keyed by read id, with C{int} number of no-coverage
        characters in the read sequence.
    """
    if noCoverageChars is None:
        # No need to look at the sequences.
        return {read.id: 0 for read in reads}

    noCoverage = frozenset(noCoverageChars)
    return {
        read.id: sum(1 for character in read.sequence if character in noCoverage)
        for read in reads
    }


def simpleReadSplitter(
    length: int,
    leftPrefix: str = "",
    leftSuffix: str = "",
    rightPrefix: str = "",
    rightSuffix: str = "",
) -> Callable[[Read], Iterable[Read]]:
    """
    Make a function that splits reads that are longer than a given length
    into a left and a right fragment.

    @param length: The C{int} read length, above which reads are to be split.
        This is also the length of the left fragment.
    @param leftPrefix: The C{str} id prefix for left fragments.
    @param leftSuffix: The C{str} id suffix for left fragments.
    @param rightPrefix: The C{str} id prefix for right fragments.
    @param rightSuffix: The C{str} id suffix for right fragments.
    @return: A function that takes a C{Read} and returns a generator of
        C{Read} instances.
    """

    def splitter(read: Read) -> Iterable[Read]:
        """
        Yield the left and right fragments of a sufficiently long read, or
        just the original read if it is too short to be split.

        @param read: The C{Read} to split.
        @return: A generator that yields either one (for reads not longer than
            C{length}) or two reads. If two fragments are yielded, their ids will
            be the id of the original read, each with a left or right suffix and prefix.
        """
        if len(read) <= length:
            yield read
            return

        # The fragments are made by slicing the read; only their ids are then
        # changed.
        leftFragment = read[:length]
        leftFragment.id = leftPrefix + read.id + leftSuffix
        yield leftFragment

        rightFragment = read[length:]
        rightFragment.id = rightPrefix + read.id + rightSuffix
        yield rightFragment

    return splitter
