-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[wip] Load compressed sequences into database
This was tried, and ultimately added too much overhead for `augur filter`. The usage of sequence data in `augur filter` is trivial - all the logic is done on the tabular metadata file, which loads nicely into the database. Sequence data is simply iterated via a generator, and anything that has passed metadata and sequence index filters gets written out. This means no sequence data needs to be kept in memory, eliminating the need to load it all into a database, which comes with storage overhead: sequence data at large scale should be compressed. Uncompressed open (GenBank) SARS-CoV-2 data as of today yielded a SQLite DB file of >80GB before I terminated the process due to limited local storage. The compression of all open (GenBank) SARS-CoV-2 sequences to date using zlib took 4.5 hours on my local machine, compared to 7 minutes using the current approach of reading and outputting on the fly.
- Loading branch information
Showing
3 changed files
with
98 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import sqlite3 | ||
import zlib | ||
from Bio import SeqIO | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
|
||
from augur.io import open_file | ||
|
||
SEQUENCE_ID_COLUMN = 'id' | ||
SEQUENCE_VALUE_COLUMN = 'seq' | ||
|
||
def load_fasta(fasta_file:str, connection:sqlite3.Connection, table_name:str):
    """Load sequence data from a FASTA file into a database table.

    Creates ``table_name`` with an id TEXT column and a seq BLOB column,
    then inserts one row per sequence record. Sequence values arrive
    zlib-compressed from ``_iter_sequences``.

    Parameters
    ----------
    fasta_file : str
        Path to the (possibly compressed) FASTA file.
    connection : sqlite3.Connection
        Open SQLite connection. Used as a context manager so the table
        creation and the bulk insert are each committed (or rolled back)
        atomically.
    table_name : str
        Name of the table to create and populate.

    Raises
    ------
    ValueError
        If the bulk insert fails.
    """
    # SQLite identifiers cannot be bound as parameters, so quote the
    # table name (doubling embedded quotes) instead of splicing it raw.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    with connection:
        create_table_statement = f"""
            CREATE TABLE {quoted_table} (
                {SEQUENCE_ID_COLUMN} TEXT,
                {SEQUENCE_VALUE_COLUMN} BLOB
            )
        """
        connection.execute(create_table_statement)

    insert_statement = f"""
        INSERT INTO {quoted_table}
        VALUES (?,?)
    """
    # TODO: format=VCF
    rows = _iter_sequences(fasta_file)
    try:
        with connection:
            # executemany streams the generator; no full materialization.
            connection.executemany(insert_statement, rows)
    except sqlite3.ProgrammingError as e:
        raise ValueError(f'Failed to load {fasta_file}.') from e
|
||
|
||
def _iter_sequences(fasta_file:str, format="fasta"):
    """Yield ``(id, compressed_sequence)`` tuples from a sequence file.

    Each record's sequence string is UTF-8 encoded and zlib-compressed to
    keep the database small; consumers must ``zlib.decompress`` the value.

    Parameters
    ----------
    fasta_file : str
        Path to the (possibly compressed) sequence file.
    format : str
        Any format accepted by ``Bio.SeqIO.parse`` (default ``"fasta"``).
    """
    with open_file(fasta_file) as f:
        for record in SeqIO.parse(f, format):
            yield (record.id, zlib.compress(str(record.seq).encode()))
|
||
def write_fasta(fasta_file:str, connection:sqlite3.Connection, table_name:str):
    """Write sequences from a database table out to a FASTA file.

    Reads ``(id, compressed seq)`` rows from ``table_name``, decompresses
    each sequence, and writes it in two-line FASTA format.

    Parameters
    ----------
    fasta_file : str
        Output path; compression is handled by ``open_file`` based on the
        file extension.
    connection : sqlite3.Connection
        Open SQLite connection to read from.
    table_name : str
        Name of the table holding the sequences.
    """
    rows = connection.execute(f"""
        SELECT {SEQUENCE_ID_COLUMN}, {SEQUENCE_VALUE_COLUMN}
        FROM {table_name}
    """)
    # Build records lazily and hand SeqIO.write a single generator so the
    # result set is streamed record-by-record, never held in memory at once.
    records = (
        SeqRecord(
            Seq(zlib.decompress(seq_blob).decode('UTF-8')),
            id=seq_id,
            description=''
        )
        for seq_id, seq_blob in rows
    )
    with open_file(fasta_file, 'w') as f:
        SeqIO.write(records, f, "fasta-2line")