Skip to content

Commit

Permalink
Add pairaln
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Sep 22, 2021
1 parent be11943 commit 9a0df0d
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
extern int enrich(int argc, const char **argv, const Command& command);
extern int expandaln(int argc, const char **argv, const Command& command);
extern int expand2profile(int argc, const char **argv, const Command& command);
extern int pairaln(int argc, const char **argv, const Command& command);
extern int countkmer(int argc, const char **argv, const Command& command);
extern int extractalignedregion(int argc, const char **argv, const Command& command);
extern int extractdomains(int argc, const char **argv, const Command& command);
Expand Down
11 changes: 9 additions & 2 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1134,8 +1134,15 @@ std::vector<Command> baseCommands = {
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},


{"pairaln", pairaln, &par.threadsandcompression, COMMAND_EXPERT,
"Pair sequences to match best protein A and B from an species",
NULL,
"Martin Steinegger <martin.steinegger@snu.ac.kr>",
"<i:queryDB> <i:targetDB> <i:alnDB> <o:alnDB|ca3mDB>",
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::sequenceDb },
{"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"diffseqdbs", diffseqdbs, &par.diff, COMMAND_SPECIAL,
"Compute diff of two sequence DBs",
// "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that has been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.",
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ set(util_source_files
util/msa2result.cpp
util/nrtotaxmapping.cpp
util/countkmer.cpp
util/pairaln.cpp
util/prefixid.cpp
util/profile2pssm.cpp
util/profile2seq.cpp
Expand Down
117 changes: 117 additions & 0 deletions src/util/pairaln.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#include "Util.h"
#include "Parameters.h"
#include "Matcher.h"
#include "Debug.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "IndexReader.h"
#include "MemoryMapped.h"
#include "NcbiTaxonomy.h"
#include "MappingReader.h"

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

#include <map>

#ifdef OPENMP
#include <omp.h>
#endif

int pairaln(int argc, const char **argv, const Command& command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false;
const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);

std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
MappingReader* mapping = new MappingReader(db2NoIndexName);
const int dbaccessMode = (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
IndexReader *tDbr;
if (sameDB) {
tDbr = &qDbr;
} else {
tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
}

DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
alnDbr.open(DBReader<unsigned int>::NOSORT);
if(alnDbr.getSize() % 2 != 0){
Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n";
EXIT(EXIT_FAILURE);
}
size_t localThreads = 1;
#ifdef OPENMP
localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1);
#endif

DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
resultWriter.open();

Debug::Progress progress(alnDbr.getSize());
#pragma omp parallel num_threads(localThreads)
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
char buffer[1024 + 32768*4];
std::string result;
result.reserve(1024 * 1024);
std::vector<Matcher::result_t> resultA;
resultA.reserve(100000);
std::vector<Matcher::result_t> resultB;
resultB.reserve(100000);
std::unordered_map<unsigned int, Matcher::result_t *> findPair;
std::string outputA;
outputA.reserve(100000);
std::string outputB;
outputB.reserve(100000);
#pragma omp for schedule(static, 2)
for (size_t i = 0; i < alnDbr.getSize(); i+=2) {
progress.updateProgress();
progress.updateProgress();
resultA.clear();
resultB.clear();
outputA.clear();
outputB.clear();
Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true);
Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true);

for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) {
unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey);
if(findPair.find(taxon) == findPair.end()){
findPair.insert({taxon, &resultA[resIdx]});
}
}
// find pairs
for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) {
unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey);
// found pair!
if(findPair.find(taxon) != findPair.end()) {
bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0);
size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false);
outputA.append(buffer, len);
len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false);
outputB.append(buffer, len);
findPair.erase (taxon);
}
}
resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx);
resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx);
}
}

resultWriter.close();
// clean up
delete mapping;
alnDbr.close();
if (sameDB == false) {
delete tDbr;
}

return 0;
}

0 comments on commit 9a0df0d

Please sign in to comment.