Skip to content
Merged
33 changes: 25 additions & 8 deletions data/workflow/translated_search.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,18 @@ TMP_PATH="$4"
QUERY="$1"
QUERY_ORF="$1"
if [ -n "$QUERY_NUCL" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
if [ "$ORF_SKIP" = "TRUE" ]; then
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$1" "${TMP_PATH}/q_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/q_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$1" "${TMP_PATH}/q_orfs_aa" ${ORF_PAR} \
|| fail "extract orfs step died"
fi
fi
QUERY="${TMP_PATH}/q_orfs_aa"
QUERY_ORF="${TMP_PATH}/q_orfs_aa"
Expand All @@ -34,10 +42,19 @@ TARGET="$2"
TARGET_ORF="$2"
if [ -n "$TARGET_NUCL" ]; then
if [ -n "$NO_TARGET_INDEX" ]; then
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
if [ "$ORF_SKIP" ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" extractframes "$2" "${TMP_PATH}/t_orfs_aa" ${EXTRACT_FRAMES_PAR} \
|| fail "extractframes died"
fi
else
if notExists "${TMP_PATH}/t_orfs_aa.dbtype"; then
# same here
# shellcheck disable=SC2086
"$MMSEQS" extractorfs "$2" "${TMP_PATH}/t_orfs_aa" ${ORF_PAR} \
|| fail "extract target orfs step died"
fi
fi
TARGET="${TMP_PATH}/t_orfs_aa"
TARGET_ORF="${TMP_PATH}/t_orfs_aa"
Expand Down
9 changes: 7 additions & 2 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Parameters::Parameters():
PARAM_ORF_FILTER_S(PARAM_ORF_FILTER_S_ID, "--orf-filter-s", "ORF filter sensitivity", "Sensitivity used for query ORF prefiltering", typeid(float), (void *) &orfFilterSens, "^[0-9]*(\\.[0-9]+)?$"),
PARAM_ORF_FILTER_E(PARAM_ORF_FILTER_E_ID, "--orf-filter-e", "ORF filter e-value", "E-value threshold used for query ORF prefiltering", typeid(double), (void *) &orfFilterEval, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$"),
PARAM_LCA_SEARCH(PARAM_LCA_SEARCH_ID, "--lca-search", "LCA search mode", "Efficient search for LCA candidates", typeid(bool), (void *) &lcaSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "ORF skip mode", "???", typeid(bool), (void *) &orfSkip, ""),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still TODO

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PARAM_ORF_SKIP(PARAM_ORF_SKIP_ID, "--orf-skip", "Extract frames instead of ORFs", "Extract frames instead of ORFs during translated search", typeid(bool), (void *) &orfSkip, ""),

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@martin-steinegger any better ideas for a parameter name?

// easysearch
PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
// extractorfs
Expand Down Expand Up @@ -727,7 +728,9 @@ Parameters::Parameters():

// extract frames
extractframes.push_back(&PARAM_ORF_FORWARD_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_ORF_REVERSE_FRAMES);
extractframes.push_back(&PARAM_TRANSLATION_TABLE);
extractframes.push_back(&PARAM_TRANSLATE);
extractframes.push_back(&PARAM_CREATE_LOOKUP);
extractframes.push_back(&PARAM_THREADS);
extractframes.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -1251,7 +1254,7 @@ Parameters::Parameters():
searchworkflow = combineList(searchworkflow, rescorediagonal);
searchworkflow = combineList(searchworkflow, result2profile);
searchworkflow = combineList(searchworkflow, extractorfs);
searchworkflow = combineList(searchworkflow, translatenucs);
searchworkflow = combineList(searchworkflow, extractframes);
searchworkflow = combineList(searchworkflow, splitsequence);
searchworkflow = combineList(searchworkflow, offsetalignment);
// needed for slice search, however all its parameters are already present in searchworkflow
Expand All @@ -1268,6 +1271,7 @@ Parameters::Parameters():
searchworkflow.push_back(&PARAM_RUNNER);
searchworkflow.push_back(&PARAM_REUSELATEST);
searchworkflow.push_back(&PARAM_REMOVE_TMP_FILES);
searchworkflow.push_back(&PARAM_ORF_SKIP);

linsearchworkflow = combineList(align, kmersearch);
linsearchworkflow = combineList(linsearchworkflow, swapresult);
Expand Down Expand Up @@ -2277,6 +2281,7 @@ void Parameters::setDefaults() {
orfFilterSens = 2.0;
orfFilterEval = 100;
lcaSearch = false;
orfSkip = false;

greedyBestHits = false;

Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ class Parameters {
float orfFilterSens;
double orfFilterEval;
bool lcaSearch;
bool orfSkip;

// easysearch
bool greedyBestHits;
Expand Down Expand Up @@ -886,6 +887,7 @@ class Parameters {
PARAMETER(PARAM_ORF_FILTER_S)
PARAMETER(PARAM_ORF_FILTER_E)
PARAMETER(PARAM_LCA_SEARCH)
PARAMETER(PARAM_ORF_SKIP)

// easysearch
PARAMETER(PARAM_GREEDY_BEST_HITS)
Expand Down
214 changes: 179 additions & 35 deletions src/util/extractframes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "DBWriter.h"
#include "Matcher.h"
#include "Util.h"
#include "TranslateNucl.h"
#include "itoa.h"

#include "Orf.h"
Expand All @@ -23,16 +24,21 @@ int extractframes(int argc, const char **argv, const Command& command) {
DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
reader.open(DBReader<unsigned int>::NOSORT);

DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
int outputDbtype = reader.getDbtype();
if(par.translate) {
outputDbtype = Parameters::DBTYPE_AMINO_ACIDS;
}
DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, outputDbtype);
sequenceWriter.open();

DBWriter headerWriter(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, false, Parameters::DBTYPE_GENERIC_DB);
headerWriter.open();

unsigned int forwardFrames = Orf::getFrames(par.forwardFrames);
unsigned int reverseFrames = Orf::getFrames(par.reverseFrames);
Debug::Progress progress(reader.getSize());

Debug::Progress progress(reader.getSize());
TranslateNucl translateNucl(static_cast<TranslateNucl::GenCode>(par.translationTable));
#pragma omp parallel
{
int thread_idx = 0;
Expand All @@ -46,6 +52,7 @@ int extractframes(int argc, const char **argv, const Command& command) {
queryFrom = 0;
}

char* aa = new char[par.maxSeqLen + 3 + 1];
char buffer[1024];
std::string reverseComplementStr;
reverseComplementStr.reserve(32000);
Expand All @@ -57,24 +64,91 @@ int extractframes(int argc, const char **argv, const Command& command) {
size_t dataLength = reader.getEntryLen(i);

size_t bufferLen;
switch (forwardFrames){
case Orf::FRAME_1:
// -1 to ignore the null byte copy the new line
sequenceWriter.writeData(data, dataLength - 1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(data + 1, dataLength - 2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(data + 2, dataLength - 3, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (forwardFrames & Orf::FRAME_1) {
size_t cur_dataLength = dataLength-1;
const char* cur_data = data;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(0), dataLength - 3, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
if (forwardFrames & Orf::FRAME_2) {
size_t cur_dataLength = dataLength - 2;
const char* cur_data = data + 1;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(1), dataLength - 4, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
if (forwardFrames & Orf::FRAME_3) {
size_t cur_dataLength = dataLength - 3;
const char* cur_data = data + 2;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, static_cast<size_t >(2), dataLength - 5, 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}


if(reverseFrames != 0){
size_t sequenceLength = dataLength -2;
Expand All @@ -91,25 +165,95 @@ int extractframes(int argc, const char **argv, const Command& command) {
reverseComplementStr.push_back('\n');
}

switch (reverseFrames){
case Orf::FRAME_1:
sequenceWriter.writeData(reverseComplementStr.c_str(), reverseComplementStr.size(), key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_2:
sequenceWriter.writeData(reverseComplementStr.c_str()+1, reverseComplementStr.size()-1, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
case Orf::FRAME_3:
sequenceWriter.writeData(reverseComplementStr.c_str()+2, reverseComplementStr.size()-2, key, thread_idx);
bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
break;
if (reverseFrames & Orf::FRAME_1) {
size_t cur_dataLength = reverseComplementStr.size();
const char* cur_data = reverseComplementStr.c_str();

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 2, static_cast<size_t >(0), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}

if (reverseFrames & Orf::FRAME_2) {
size_t cur_dataLength = reverseComplementStr.size() - 1;
const char* cur_data = reverseComplementStr.c_str() + 1;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 3, static_cast<size_t >(1), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}

if (reverseFrames & Orf::FRAME_3) {
size_t cur_dataLength = reverseComplementStr.size() - 2;
const char* cur_data = reverseComplementStr.c_str() + 2;

sequenceWriter.writeStart(thread_idx);
if(par.translate){
if ((cur_data[cur_dataLength] != '\n' && cur_dataLength % 3 != 0) && (cur_data[cur_dataLength - 1] == '\n' && (cur_dataLength - 1) % 3 != 0)) {
cur_dataLength = cur_dataLength - (cur_dataLength % 3);
}

if (cur_dataLength < 3) {
continue;
}

if (cur_dataLength > (3 * par.maxSeqLen)) {
cur_dataLength = (3 * par.maxSeqLen);
}
translateNucl.translate(aa, cur_data, cur_dataLength);
sequenceWriter.writeAdd(aa, (cur_dataLength / 3), thread_idx);

}else{
sequenceWriter.writeAdd(cur_data, cur_dataLength, thread_idx);
}
sequenceWriter.writeEnd(key, thread_idx);

bufferLen = Orf::writeOrfHeader(buffer, key, reverseComplementStr.size() - 4, static_cast<size_t >(2), 0, 0);
headerWriter.writeData(buffer, bufferLen, key, thread_idx);
}
reverseComplementStr.clear();
}
delete[] aa;
}
headerWriter.close(true);
sequenceWriter.close(true);
Expand Down
11 changes: 8 additions & 3 deletions src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ int search(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < par.extractorfs.size(); i++) {
par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.translatenucs.size(); i++) {
par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
for (size_t i = 0; i < par.extractframes.size(); i++) {
par.extractframes[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
}
for (size_t i = 0; i < par.splitsequence.size(); i++) {
par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
Expand Down Expand Up @@ -541,9 +541,14 @@ int search(int argc, const char **argv, const Command& command) {
par.subDbMode = 1;
cmd.addVariable("CREATESUBDB_PAR", par.createParameterString(par.createsubdb).c_str());
par.translate = 1;
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
cmd.addVariable("SEARCH", program.c_str());
if (par.orfSkip) {
cmd.addVariable("ORF_SKIP", "TRUE");
cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
} else {
cmd.addVariable("ORF_PAR", par.createParameterString(par.extractorfs).c_str());
}
program = std::string(tmpDir + "/translated_search.sh");
}else if(searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE &&
searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE){
Expand Down