diff --git a/benchmarks/sequence/__init__.py b/benchmarks/sequence/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/sequence/align/__init__.py b/benchmarks/sequence/align/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/sequence/align/benchmark_kmers.py b/benchmarks/sequence/align/benchmark_kmers.py new file mode 100644 index 000000000..bd02ffe9d --- /dev/null +++ b/benchmarks/sequence/align/benchmark_kmers.py @@ -0,0 +1,168 @@ +import functools +import pickle +import numpy as np +import pytest +import biotite.sequence as seq +import biotite.sequence.align as align + + +class FixedBucketKmerTable: + """ + A wrapper around :class:`BucketKmerTable` with a fixed number of + buckets. + This allows test functions to call static functions from + :class:`KmerTable` and :class:`FixedBinnedKmerTable` with the same + signature, avoiding if-else constructs. + """ + + def __init__(self, n_buckets): + self._n_buckets = n_buckets + + def __getattr__(self, name): + attr = getattr(align.BucketKmerTable, name) + if attr.__name__ in ["from_sequences", "from_kmers", "from_kmer_selection"]: + return functools.partial(attr, n_buckets=self._n_buckets) + else: + return attr + + def __repr__(self): + return f"BucketKmerTable({self._n_buckets})" + + +def idfn(val): + if isinstance(val, FixedBucketKmerTable): + return repr(val) + + +@pytest.fixture(scope="module", params=[None, "11*11*1*1***111"]) +def kmer_alphabet(request): + return align.KmerAlphabet( + seq.NucleotideSequence.unambiguous_alphabet(), k=9, spacing=request.param + ) + + +@pytest.fixture(scope="module") +def seq_code(): + LENGTH = 1000 + + rng = np.random.default_rng(0) + return rng.integers( + len(seq.NucleotideSequence.unambiguous_alphabet()), size=LENGTH, dtype=np.uint8 + ) + + +@pytest.fixture(scope="module") +def kmer_code(kmer_alphabet, seq_code): + return kmer_alphabet.create_kmers(seq_code) + + +@pytest.fixture( + scope="module", params=[align.KmerTable, FixedBucketKmerTable(10000)], ids=idfn +) +def kmer_table(kmer_alphabet, request): + N_SEQUENCES = 100 + LENGTH = 1000 + + Table = request.param + + rng = np.random.default_rng(0) + seq_codes = [ + rng.integers( + len(seq.NucleotideSequence.unambiguous_alphabet()), + size=LENGTH, + dtype=np.uint8, + ) + for _ in range(N_SEQUENCES) + ] + + kmers = [kmer_alphabet.create_kmers(seq_code) for seq_code in seq_codes] + return Table.from_kmers(kmer_alphabet, kmers) + + +@pytest.mark.benchmark +def benchmark_kmer_decomposition(kmer_alphabet, seq_code): + kmer_alphabet.create_kmers(seq_code) + + +@pytest.mark.parametrize( + "Table", + [ + align.KmerTable, + FixedBucketKmerTable(100000), + ], + ids=idfn, +) +@pytest.mark.benchmark +def benchmark_indexing_from_sequences(kmer_alphabet, seq_code, Table): + N_SEQUENCES = 100 + + sequence = seq.NucleotideSequence() + sequence.code = seq_code + sequences = [sequence] * N_SEQUENCES + Table.from_sequences(kmer_alphabet.k, sequences, spacing=kmer_alphabet.spacing) + + +@pytest.mark.parametrize( + "Table", + [ + align.KmerTable, + FixedBucketKmerTable(100000), + ], + ids=idfn, +) +@pytest.mark.benchmark +def benchmark_indexing_from_kmers(kmer_alphabet, seq_code, Table): + N_SEQUENCES = 100 + + kmers = [kmer_alphabet.create_kmers(seq_code)] * N_SEQUENCES + Table.from_kmers(kmer_alphabet, kmers) + + +@pytest.mark.parametrize( + "Table", + [ + align.KmerTable, + FixedBucketKmerTable(100000), + ], + ids=idfn, +) +@pytest.mark.benchmark +def benchmark_indexing_from_kmer_selection(kmer_alphabet, kmer_code, Table): + N_SEQUENCES = 100 + + kmers = [kmer_code] * N_SEQUENCES + positions = [np.arange(len(kmer_code), dtype=np.uint32)] * N_SEQUENCES + Table.from_kmer_selection(kmer_alphabet, positions, kmers) + + +@pytest.mark.benchmark +def benchmark_match(seq_code, kmer_table): + sequence = seq.NucleotideSequence() + sequence.code = seq_code + kmer_table.match(sequence) + + +@pytest.mark.benchmark +def benchmark_match_kmer_selection(kmer_code, kmer_table): + positions = np.arange(len(kmer_code), dtype=np.uint32) + kmer_table.match_kmer_selection(positions, kmer_code) + + +@pytest.mark.benchmark +def benchmark_match_table(kmer_table): + kmer_table.match_table(kmer_table) + + +@pytest.mark.benchmark +def test_pickle_and_unpickle(kmer_table): + pickle.loads(pickle.dumps(kmer_table)) + + +@pytest.mark.benchmark +def benchmark_score_threshold_rule(kmer_alphabet, kmer_code): + SCORE_THRESHOLD = 10 + + matrix = align.SubstitutionMatrix.std_nucleotide_matrix() + rule = align.ScoreThresholdRule(matrix, SCORE_THRESHOLD) + for kmer in kmer_code: + rule.similar_kmers(kmer_alphabet, kmer) diff --git a/doc/examples/scripts/sequence/homology/genome_comparison.py b/doc/examples/scripts/sequence/homology/genome_comparison.py index 066388ce5..52802ee0f 100644 --- a/doc/examples/scripts/sequence/homology/genome_comparison.py +++ b/doc/examples/scripts/sequence/homology/genome_comparison.py @@ -67,7 +67,7 @@ # Second, spaced *k-mers* :footcite:`Ma2002` are used instead of # continuous ones to increase the sensitivity. # However, not every spacing model performs equally well, so the proven -# one ``111∗1∗11∗1∗∗11∗111`` :footcite:`Choi2004` is used here. +# one ``111*1*11*1*11*111`` :footcite:`Choi2004` is used here. repeat_mask = tantan.TantanApp.mask_repeats(bacterium_seq) bacterium_seqs = [bacterium_seq, bacterium_seq.reverse(copy=False).complement()] @@ -75,7 +75,7 @@ table = align.KmerTable.from_sequences( k=12, sequences=bacterium_seqs, - spacing="111∗1∗11∗1∗∗11∗111", + spacing="111*1*11*1*11*111", ignore_masks=[repeat_mask, repeat_mask[::-1].copy()], ) diff --git a/pyproject.toml b/pyproject.toml index 018dff725..676f949d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,8 @@ ignore = [ "__init__.py" = ["F403", "TID252"] # Due to pymol scripts that are evaluated in other example scripts "doc/examples/**/*_pymol.py" = ["F821"] +# Due to 'Table' class used as parametrized argument in test functions +"benchmarks/sequence/align/benchmark_kmers.py" = ["N803"] [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all"