diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index eb9fb4cb..febad5f6 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -64,7 +64,7 @@ BaseAligner::BaseAligner( genomeIndex(i_genomeIndex), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK), maxReadSize(i_maxReadSize), maxSeedsToUseFromCommandLine(i_maxSeedsToUseFromCommandLine), maxSeedCoverage(i_maxSeedCoverage), readId(-1), extraSearchDepth(i_extraSearchDepth), - explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), + explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), allMatches(NULL), noUkkonen(i_noUkkonen), noOrderedEvaluation(i_noOrderedEvaluation), noTruncation(i_noTruncation), minWeightToCheck(max(1u, i_minWeightToCheck)), maxSecondaryAlignmentsPerContig(i_maxSecondaryAlignmentsPerContig) /*++ @@ -226,6 +226,10 @@ Routine Description: } hashTableEpoch = 0; + if (genome->hasAltContigs()) { + MatchInfo* entries = (MatchInfo*)allocator->allocate(sizeof(MatchInfo)* maxHitsToConsider); + allMatches = new MatchInfoVector(entries, maxHitsToConsider); + } } @@ -652,6 +656,36 @@ Return Value: return; } + /** + * Add up the highest-probability matches of all overlapping alternates + */ + double +BaseAligner::computeLiftedCandidateProbability( + GenomeDistance length) +{ + std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator); + double totalProbability = 0.0; + MatchInfo best(0, 0, 0); + GenomeLocation farthest; + for (int i = 0; i <= allMatches->size(); i++) { + MatchInfo m(0, 0, 0); + if (i == allMatches->size() || (m = (*allMatches)[i]).liftedLocation > farthest) { + totalProbability += best.matchProbability; + best = m; + } + else { + if (m.matchProbability > best.matchProbability) { + best = m; + } + GenomeLocation e = m.liftedLocation + length - 1; + if (e > farthest) { + farthest = e; + } + } + } + return totalProbability; +} + bool BaseAligner::score( bool forceResult, @@ -744,6 +778,10 @@ Return Value: #endif unsigned weightListToCheck = highestUsedWeightList; + bool anyAltMatches = false; + if (allMatches != NULL) { + allMatches->clear(); + } do { // @@ -764,6 +802,9 @@ Return Value: primaryResult->score = bestScore; if (bestScore <= maxK) { primaryResult->location = bestScoreGenomeLocation; + if (anyAltMatches) { + probabilityOfAllCandidates = computeLiftedCandidateProbability(read[0]->getDataLength()); + } primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped); if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) { primaryResult->status = SingleHit; @@ -913,6 +954,14 @@ Return Value: // We could mark as scored anything in between the old and new genome offsets, but it's probably not worth the effort since this is // so rare and all it would do is same time. // + + // remember in case there are alt matches + if (allMatches != NULL) { + if ((! anyAltMatches) && genome->getLiftedLocation(genomeLocation) != genomeLocation) { + anyAltMatches = true; + } + allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability)); + } } } } else { // if we had genome data to compare against @@ -1114,7 +1163,6 @@ Return Value: return false; } - void BaseAligner::prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction) { @@ -1455,17 +1503,18 @@ BaseAligner::getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishki } return - contigCounters + - sizeof(_uint64) * 14 + // allow for alignment - sizeof(BaseAligner) + // our own member variables + contigCounters + + sizeof(_uint64)* 14 + // allow for alignment + sizeof(BaseAligner)+ // our own member variables (ownLandauVishkin ? - LandauVishkin<>::getBigAllocatorReservation() + - LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects - sizeof(char) * maxReadSize * 2 + // rcReadData - sizeof(char) * maxReadSize * 4 + 2 * MAX_K + // reversed read (both) - sizeof(BYTE) * (maxReadSize + 7 + 128) / 8 + // seed used - sizeof(HashTableElement) * hashTableElementPoolSize + // hash table element pool - sizeof(HashTableAnchor) * candidateHashTablesSize * 2 + // candidate hash table (both) + LandauVishkin<>::getBigAllocatorReservation() + + LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects + sizeof(char)* maxReadSize * 2 + // rcReadData + sizeof(char)* maxReadSize * 4 + 2 * MAX_K + // reversed read (both) + sizeof(BYTE)* (maxReadSize + 7 + 128) / 8 + // seed used + sizeof(HashTableElement)* hashTableElementPoolSize + // hash table element pool + sizeof(HashTableAnchor)* candidateHashTablesSize * 2 + // candidate hash table (both) + (index->getGenome()->hasAltContigs() ? sizeof(MatchInfo) * maxHitsToConsider : 0) + // matches for alt contigs sizeof(HashTableElement) * (maxSeedsToUse + 1); // weight lists } diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h index 61a24cba..5a876c04 100644 --- a/SNAPLib/BaseAligner.h +++ b/SNAPLib/BaseAligner.h @@ -34,6 +34,7 @@ Revision History: #include "AlignerStats.h" #include "directions.h" #include "GenomeIndex.h" +#include "VariableSizeVector.h" extern bool doAlignerPrefetch; @@ -326,6 +327,27 @@ class BaseAligner { AlignerStats *stats; + typedef struct MatchInfo + { + GenomeLocation location; + GenomeLocation liftedLocation; + double matchProbability; + + MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) : + location(_loc), liftedLocation(_lifted), matchProbability(_p) {} + } MatchInfo; + + static bool matchInfoComparator(const BaseAligner::MatchInfo& a, const BaseAligner::MatchInfo& b) + { + return a.liftedLocation < b.liftedLocation; + } + + typedef VariableSizeVector<MatchInfo,0> MatchInfoVector; + + MatchInfoVector* allMatches; + + double computeLiftedCandidateProbability(GenomeDistance length); + unsigned *hitCountByExtraSearchDepth; // How many hits at each depth bigger than the current best edit distance. // So if the current best hit has edit distance 2, then hitCountByExtraSearchDepth[0] would // be the count of hits at edit distance 2, while hitCountByExtraSearchDepth[2] would be the count diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp index 25418eef..05ae845d 100644 --- a/SNAPLib/FASTA.cpp +++ b/SNAPLib/FASTA.cpp @@ -36,7 +36,10 @@ ReadFASTAGenome( const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, - unsigned chromosomePaddingSize) + unsigned chromosomePaddingSize, + const char *chrTag, + const char *chrMapFilename, + AltContigMap* altMap) { // // We need to know a bound on the size of the genome before we create the Genome object. @@ -53,15 +56,38 @@ ReadFASTAGenome( isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true; isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true; + int lineBufferSize = 0; + char *lineBuffer; + + map<string, string> chrMap; + if (chrMapFilename != NULL) { + FILE* mapFile = fopen(chrMapFilename, "r"); + if (mapFile == NULL) { + WriteErrorMessage("Unable to open -chrmap file '%s'\n", chrMapFilename); + return NULL; + } + while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, mapFile)) { + if (lineBuffer[0] == '#') { + continue; + } + string chrom; + for (char * token = strtok(lineBuffer, "\t\r\n"); token != NULL; token = strtok(NULL, "\t\r\n")) { + if (token == lineBuffer) { + chrom = string(token); + } else { + chrMap[string(token)] = chrom; + } + } + } + fclose(mapFile); + } + FILE *fastaFile = fopen(fileName, "r"); if (fastaFile == NULL) { WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName); return NULL; } - int lineBufferSize = 0; - char *lineBuffer; - // // Count the chromosomes // @@ -96,33 +122,59 @@ ReadFASTAGenome( // // Now supply the chromosome name. // - if (NULL != pieceNameTerminatorCharacters) { - for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { - char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]); - if (NULL != terminator) { - *terminator = '\0'; + const char *chrName; + int chrNameLen; + if (chrTag == NULL) { + char * terminator = lineBuffer + strlen(lineBuffer); + char * p; + if (NULL != pieceNameTerminatorCharacters) { + for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { + p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); + if (NULL != p && p < terminator) { + terminator = p; + } } } - } - if (spaceIsAPieceNameTerminator) { - char *terminator = strchr(lineBuffer, ' '); - if (NULL != terminator) { - *terminator = '\0'; + if (spaceIsAPieceNameTerminator) { + p = strchr(lineBuffer, ' '); + if (NULL != p && p < terminator) { + terminator = p; + } + p = strchr(lineBuffer, '\t'); + if (NULL != p && p < terminator) { + terminator = p; + } } - terminator = strchr(lineBuffer, '\t'); - if (NULL != terminator) { - *terminator = '\0'; + p = strchr(lineBuffer, '\n'); + if (NULL != p && p < terminator) { + terminator = p; + } + p = strchr(lineBuffer, '\r'); + if (NULL != p && p < terminator) { + terminator = p; + } + chrName = lineBuffer + 1; + chrNameLen = (int) (terminator - lineBuffer - 1); + } else { + if (!FindFASTATagValue(lineBuffer, chrTag, &chrName, &chrNameLen)) { + WriteErrorMessage("Unable to find tag '%s' in contig '%s'\n", chrTag, lineBuffer + 1); + soft_exit(1); + } + if (chrMapFilename != NULL) { + map<string,string>::iterator mapped = chrMap.find(string(chrName, chrName + chrNameLen)); + if (mapped != chrMap.end()) { + chrName = mapped->second.data(); + chrNameLen = (int) mapped->second.length(); + } } } - char *terminator = strchr(lineBuffer, '\n'); - if (NULL != terminator) { - *terminator = '\0'; - } - terminator = strchr(lineBuffer, '\r'); - if (NULL != terminator) { - *terminator = '\0'; + if (altMap != NULL) { + altMap->addFastaContig(lineBuffer, chrName, chrNameLen); } - genome->startContig(lineBuffer+1); + char *contigName = (char*) malloc(chrNameLen + 1); + memcpy(contigName, chrName, chrNameLen); + contigName[chrNameLen] = '\0'; + genome->startContig(contigName, altMap); } else { if (!inAContig) { WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n"); @@ -170,6 +222,9 @@ ReadFASTAGenome( // genome->addData(paddingBuffer); genome->fillInContigLengths(); + if (altMap != NULL) { + genome->adjustAltContigs(altMap); + } genome->sortContigsByName(); fclose(fastaFile); @@ -198,3 +253,23 @@ bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="") } return !ferror(fasta); } + + bool +FindFASTATagValue(const char* lineBuffer, const char* tagName, const char ** pTagValue, int * pValueLength) +{ + const char *tag = lineBuffer; + do { + tag = strstr(tag + 1, tagName); + if (tag == NULL) { + return false; + } + } while (tag[-1] != '>' && tag[-1] != '|' && tag[strlen(tagName)] != '|'); + *pTagValue = tag + strlen(tagName) + 1; // Format is "tag|value| + const char *tagValueEnd = strchr(*pTagValue, '|'); + if (tagValueEnd == NULL) { + WriteErrorMessage("Badly formatted tag '%s' in contig '%s'\n", tag, lineBuffer + 1); + soft_exit(1); + } + *pValueLength = (int) (tagValueEnd - *pTagValue); + return true; +} diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h index 6e542f32..f3c0e3d3 100644 --- a/SNAPLib/FASTA.h +++ b/SNAPLib/FASTA.h @@ -27,7 +27,7 @@ Revision History: #include "Genome.h" const Genome * -ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize); +ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, const char* chrTag, const char* chrMapFilename, AltContigMap* altMap); // // The FASTA appending functions return whether the write was successful. @@ -40,10 +40,6 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool AppendFASTAGenome(const Genome *, FILE *fasta); -// -// This is arbitrary; is there some existing convention? -// -inline const char *diploidFASTASexPrefix(bool male) -{ - return male ? "PATERNAL|" : "MATERNAL|"; -} +// utility for parsing FASTA tags + bool +FindFASTATagValue(const char* lineBuffer, const char* tag, const char ** pTagValue, int * pValueLength); diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 4a0629ec..490393b5 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -24,6 +24,7 @@ Revision History: #include "stdafx.h" #include "Genome.h" +#include "FASTA.h" #include "GenericFile.h" #include "GenericFile_map.h" #include "Compat.h" @@ -31,10 +32,14 @@ Revision History: #include "exit.h" #include "Error.h" #include "Util.h" +#include "VariableSizeVector.h" + +#include <string> +using namespace std; Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs) : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs), - mappedFile(NULL) +mappedFile(NULL), minAltLocation(i_maxBases) { bases = ((char *) BigAlloc(nBasesStored + 2 * N_PADDING)) + N_PADDING; if (NULL == bases) { @@ -73,7 +78,7 @@ Genome::addData(const char *data) } void -Genome::startContig(const char *contigName) +Genome::startContig(const char *contigName, AltContigMap *altMap) { if (nContigs == maxContigs) { // @@ -102,6 +107,10 @@ Genome::startContig(const char *contigName) strncpy(contigs[nContigs].name,contigName,len); contigs[nContigs].name[len-1] = '\0'; + if (altMap != NULL) { + altMap->setAltContig(&contigs[nContigs]); + } + nContigs++; } @@ -149,7 +158,13 @@ Genome::saveToFile(const char *fileName) const curChar = contigs[i].name + n; if (*curChar == ' '){ *curChar = '_'; } } - fprintf(saveFile,"%lld %s\n",contigs[i].beginningLocation, contigs[i].name); + if (!hasAltContigs()) { + // backward compatible for genomes without alts + fprintf(saveFile, "%lld %s\n", contigs[i].beginningLocation, contigs[i].name); + } else { + fprintf(saveFile, "%lld %s %d %d %lld\n", contigs[i].beginningLocation, contigs[i].name, + contigs[i].isAlternate ? 1 : 0, contigs[i].isAlternateRC ? 1 : 0, contigs[i].liftedLocation); + } } // @@ -218,9 +233,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc int contigNameBufferSize = 0; char *contigNameBuffer = NULL; - unsigned n; - size_t contigSize; - char *curName; + genome->minAltLocation = nBases; for (unsigned i = 0; i < nContigs; i++) { if (NULL == reallocatingFgetsGenericFile(&contigNameBuffer, &contigNameBufferSize, loadFile)) { WriteErrorMessage("Unable to read contig description\n"); @@ -229,29 +242,50 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc return NULL; } - for (n = 0; n < (unsigned)contigNameBufferSize; n++) { - if (contigNameBuffer[n] == ' ') { - contigNameBuffer[n] = '\0'; - break; - } - } - + contigNameBuffer[contigNameBufferSize - 1] = '\0'; _int64 contigStart; - if (1 != sscanf(contigNameBuffer, "%lld", &contigStart)) { - WriteErrorMessage("Unable to parse contig start in genome file '%s', '%s%'\n", fileName, contigNameBuffer); + const char* SEP = " \n\r"; + char *token = strtok(contigNameBuffer, SEP); + if (token == NULL || 1 != sscanf(token, "%lld", &contigStart)) { +err_contig_parse: + WriteErrorMessage("Unable to parse contigs in genome file '%s', '%s%'\n", fileName, contigNameBuffer); soft_exit(1); } genome->contigs[i].beginningLocation = GenomeLocation(contigStart); - contigNameBuffer[n] = ' '; - n++; // increment n so we start copying at the position after the space - contigSize = strlen(contigNameBuffer + n) - 1; //don't include the final \n - genome->contigs[i].name = new char[contigSize + 1]; - genome->contigs[i].nameLength = (unsigned)contigSize; - curName = genome->contigs[i].name; - for (unsigned pos = 0; pos < contigSize; pos++) { - curName[pos] = contigNameBuffer[pos + n]; - } - curName[contigSize] = '\0'; + token = strtok(NULL, SEP); + if (token == NULL) goto err_contig_parse; + genome->contigs[i].name = new char[strlen(token) + 1]; + genome->contigs[i].nameLength = (unsigned)strlen(token); + strcpy(genome->contigs[i].name, token); + token = strtok(NULL, SEP); + if (token == NULL) { + genome->contigs[i].isAlternate = false; + genome->contigs[i].isAlternateRC = false; + genome->contigs[i].liftedLocation = InvalidGenomeLocation; + } else { + int isAlternate; + if (1 != sscanf(token, "%d", &isAlternate)) { + goto err_contig_parse; + } + genome->contigs[i].isAlternate = isAlternate != 0; + int isAlternateRC; + token = strtok(NULL, SEP); + if (token == NULL || 1 != sscanf(token, "%d", &isAlternateRC)) { + goto err_contig_parse; + } + genome->contigs[i].isAlternateRC = isAlternateRC != 0; + _int64 liftedLocation; + token = strtok(NULL, SEP); + if (token == NULL || 1 != sscanf(token, "%lld", &liftedLocation)) { + goto err_contig_parse; + } + genome->contigs[i].liftedLocation = liftedLocation; + + if (isAlternate && contigStart < GenomeLocationAsInt64(genome->minAltLocation)) { + genome->minAltLocation = contigStart - chromosomePadding / 2; + } + } + } // for each contig if (0 != loadFile->advance(GenomeLocationAsInt64(minLocation))) { @@ -465,6 +499,63 @@ void Genome::fillInContigLengths() contigs[nContigs-1].length = nBases - GenomeLocationAsInt64(contigs[nContigs-1].beginningLocation); } +void Genome::adjustAltContigs(AltContigMap* altMap) +{ + if (altMap != NULL) { + bool error = false; + // build parent links from alt contigs, and find minAltLocation + minAltLocation = maxBases; + for (int i = 0; i < nContigs; i++) { + if (contigs[i].isAlternate) { + if (contigs[i].beginningLocation < minAltLocation) { + minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2; + } + GenomeDistance offset; + const char* parentName = altMap->getParentContigName(contigs[i].name, &offset); + if (parentName == NULL) { + WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); + error = true; + continue; + } + GenomeLocation parentLocation; + int parentIndex; + if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) { + WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name); + error = true; + continue; + } + if (contigs[parentIndex].isAlternate) { + WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName); + error = true; continue; + } + contigs[i].liftedLocation = parentLocation + offset; + } + } + if (error) { + soft_exit(1); + } + } + + // flip RC contigs + for (int i = 0; i < nContigs; i++) { + if (contigs[i].isAlternate && contigs[i].isAlternateRC) { + util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding); + } + } +} + +GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const +{ + if (altLocation < minAltLocation) { + return altLocation; + } + const Contig* alt = getContigAtLocation(altLocation + chromosomePadding / 2); + if (alt == NULL || ! alt->isAlternate) { + return altLocation; + } + return alt->liftedLocation + (altLocation - alt->beginningLocation); +} + const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const { const Contig *contig = getContigAtLocation(location); @@ -491,4 +582,239 @@ const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned return contig; } -GenomeLocation InvalidGenomeLocation; // Gets set on genome build/load \ No newline at end of file +GenomeLocation InvalidGenomeLocation; // Gets set on genome build/load + +// terminate string at next tab or newline +// return pointer to beginning of next chunk of data +char* tokenizeToNextTabOrNewline(char* start, bool* endOfLine, bool* endOfFile) +{ + char* p = start; + while (*p) { + if (*p == '\t') { + *p = '\0'; + *endOfLine = false; + *endOfFile = false; + return p + 1; + } else if (*p == '\r' || *p == '\n') { + if (*(p + 1) != *p && (*(p + 1) == '\r' || *(p + 1) == '\n')) { + *p++ = '\0'; + } + *p = '\0'; + *endOfLine = true; + *endOfFile = false; + return p + 1; + } + p++; + } + *endOfLine = true; + *endOfFile = true; + return p; +} + +static const int ALT_SCAF_ACC = 0; +static const int PARENT_ACC = 1; +static const int ORI = 2; +static const int ALT_SCAF_START = 3; +static const int ALT_SCAF_STOP = 4; +static const int PARENT_START = 5; +static const int PARENT_STOP = 6; +static const int ALT_START_TAIL = 7; +static const int ALT_STOP_TAIL = 8; +static const int N_COLUMNS = 9; + +AltContigMap* AltContigMap::readFromFile(const char* filename, const char* columns) +{ + // just map & copy the whole file into a region of memory + FileMapper map; + if (!map.init(filename)) { +err_map_failed: + WriteErrorMessage("Failed to map file %s\n", filename); + soft_exit(1); + } + _int64 size = map.getFileSize(); + char* buffer = (char*)BigAlloc(size + 1 + strlen(columns) + 1); + void* token; + char* mapped = map.createMapping(0, size, &token); + if (mapped == NULL) { + goto err_map_failed; + } + memcpy(buffer, mapped, size); + buffer[size] = '\0'; + if (strlen(buffer) != size) { + WriteErrorMessage("Nulls in file %s\n", filename); + soft_exit(1); + } + map.unmap(token); + strcpy(buffer + size + 1, columns); + + AltContigMap* result = new AltContigMap(); + // first find accession FASTA tag, add "|" + char* p = buffer + size + 1; + if (*p == '#') { + p++; + } + char* q = strchr(p, ','); + if (q == NULL) { + WriteErrorMessage("Invalid columns spec %s\n", columns); + soft_exit(1); + } + *q = '\0'; + char * tag = (char*) malloc(q - p + 1); + strcpy(tag, p); + result->accessionFastaTag = tag; + + // get names for each column type (last 2 are optional) + p = q + 1; + char* columnNames[N_COLUMNS]; + memset(columnNames, 0, sizeof(char*) * N_COLUMNS); + for (int i = 0; i < N_COLUMNS; i++) { + columnNames[i] = p; + q = strchr(p, ','); + if (q != NULL) { + *q = '\0'; + p = q + 1; + } else { + q = p + strlen(p); + if (i < PARENT_STOP) { + WriteErrorMessage("Invalid columns spec %s\n", columns); + soft_exit(1); + } + break; + } + } + + // map column names to indices + VariableSizeVector<int> columnTypes; + p = buffer + (*buffer == '#'); // beginning of buffer, skipping possible comment char + bool endOfLine = false, endOfFile = false; + bool columnFound[N_COLUMNS]; + memset(columnFound, 0, sizeof(bool)* N_COLUMNS); + for (int columnIndex = 0; !endOfLine; columnIndex++) { + q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); + if (q == NULL) { + WriteErrorMessage("Invalid file format for alt data in %s\n", filename); + soft_exit(1); + } + for (int i = 0; i <= N_COLUMNS; i++) { + if (i < N_COLUMNS && !strcmp(columnNames[i], p)) { + columnTypes.push_back(i); + columnFound[i] = true; + break; + } else if (i == N_COLUMNS) { + columnTypes.push_back(N_COLUMNS); // ignore this column + } + } + p = q; + } + for (int i = 0; i < N_COLUMNS; i++) { + if (columnNames[i] != NULL && !columnFound[i]) { + WriteErrorMessage("Invalid file format for alt data in %s\n", filename); + soft_exit(1); + } + } + while (!endOfFile) { + endOfLine = false; + AltContig alt; + for (int columnIndex = 0; !endOfLine; columnIndex++) { + q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); + if (endOfFile) { + break; + } + switch (columnIndex < columnTypes.size() ? columnTypes[columnIndex] : N_COLUMNS) { + case ALT_SCAF_ACC: + alt.accession = p; + break; + case PARENT_ACC: + alt.parentAccession = p; + break; + case ORI: + alt.isRC = *p == '-'; + break; + case ALT_SCAF_START: + alt.start = atol(p); + break; + case ALT_SCAF_STOP: + alt.stop = atol(p); + break; + case PARENT_START: + alt.parentStart = atol(p); + break; + case PARENT_STOP: + alt.parentStop = atol(p); + break; + case ALT_START_TAIL: + alt.startTail = atol(p); + break; + case ALT_STOP_TAIL: + alt.stopTail = atol(p); + break; + case N_COLUMNS: + // ignore + break; + default: + _ASSERT(false); + } + p = q; + } + if (!endOfFile) { + result->altsByAccession[alt.accession] = alt; + } + } + return result; +} + +void AltContigMap::addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength) +{ + // get the name + string name(chrName, chrName + chrNameLength); + + // find the accession number + const char *accessionStart; + int accessionLength; + if (!FindFASTATagValue(lineBuffer, accessionFastaTag, &accessionStart, &accessionLength)) { + WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name, lineBuffer); + soft_exit(1); + } + string accession(accessionStart, accessionStart + accessionLength); + + nameToAccession[name] = accession; + accessionToName[accession] = name; + + StringAltContigMap::iterator alt = altsByAccession.find(accession); + if (alt != altsByAccession.end()) { + alt->second.name = (new string(name))->data(); // alloc & never free, but tiny :-) + } +} + +void AltContigMap::setAltContig(Genome::Contig* contig) +{ + StringMap::iterator accession = nameToAccession.find(contig->name); + if (accession != nameToAccession.end()) { + StringAltContigMap::iterator alt = altsByAccession.find(accession->second); + if (alt != altsByAccession.end()) { + contig->isAlternate = true; + contig->isAlternateRC = alt->second.isRC; + return; + } + } + contig->isAlternate = false; + contig->isAlternateRC = false; +} + +const char* AltContigMap::getParentContigName(const char* altName, GenomeDistance* pOffset) +{ + StringMap::iterator accession = nameToAccession.find(altName); + if (accession != nameToAccession.end()) { + StringAltContigMap::iterator alt = altsByAccession.find(accession->second); + if (alt != altsByAccession.end()) { + StringMap::iterator parent = accessionToName.find(alt->second.parentAccession); + if (parent != accessionToName.end()) { + if (pOffset != NULL) { + *pOffset = alt->second.parentStart - alt->second.start; + } + return parent->second.data(); + } + } + } + return NULL; +} diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index 84e94f52..f1d285e5 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -26,6 +26,8 @@ Revision History: #include "Compat.h" #include "GenericFile.h" #include "GenericFile_map.h" +#include <string> +#include <map> // // We have two different classes to represent a place in a genome and a distance between places in a genome. @@ -156,6 +158,8 @@ typedef _int64 GenomeDistance; extern GenomeLocation InvalidGenomeLocation; +class AltContigMap; + class Genome { public: // @@ -174,7 +178,8 @@ class Genome { unsigned maxContigs = 32); void startContig( - const char *contigName); + const char *contigName, + AltContigMap *altMap); void addData( const char *data); @@ -245,9 +250,15 @@ class Genome { } struct Contig { - Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL) {} + Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), + isAlternate(false), isAlternateRC(false), liftedLocation(InvalidGenomeLocation) {} GenomeLocation beginningLocation; GenomeDistance length; + + bool isAlternate; + bool isAlternateRC; // if reversed alternate strand + GenomeLocation liftedLocation; // location of beginning of alt contig mapping to primary + unsigned nameLength; char *name; }; @@ -261,6 +272,13 @@ class Genome { const Contig *getNextContigAfterLocation(GenomeLocation location) const; int getContigNumAtLocation(GenomeLocation location) const; // Returns the contig number, which runs from 0 .. getNumContigs() - 1. + inline bool hasAltContigs() const { return minAltLocation < maxBases; } + + GenomeLocation getLiftedLocation(GenomeLocation altLocation) const; + + inline bool isAltLocation(GenomeLocation location) const + { return location != InvalidGenomeLocation && location >= minAltLocation && getLiftedLocation(location) != location; } + // unused Genome *copy() const {return copy(true,true,true);} // unused Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);} @@ -268,6 +286,7 @@ class Genome { // These are only public so creators of new genomes (i.e., FASTA) can use them. // void fillInContigLengths(); + void adjustAltContigs(AltContigMap* altMap); void sortContigsByName(); private: @@ -283,6 +302,8 @@ class Genome { GenomeLocation minLocation; GenomeLocation maxLocation; + GenomeLocation minAltLocation; + // // A genome is made up of a bunch of contigs, typically chromosomes. Contigs have names, // which are stored here. @@ -307,3 +328,39 @@ inline bool genomeLocationIsWithin(GenomeLocation locationA, GenomeLocation loca { return DistanceBetweenGenomeLocations(locationA, locationB) <= distance; } + +class AltContigMap +{ +public: + AltContigMap() {} + + static AltContigMap* readFromFile(const char* filename, const char* columnList); + + void addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength); + + void setAltContig(Genome::Contig* contig); + + const char* getParentContigName(const char* altName, GenomeDistance* pOffset = NULL); + +private: + + struct AltContig { + const char* name; + const char* accession; + const char* parentAccession; + bool isRC; + GenomeLocation start, stop; + GenomeLocation parentStart, parentStop; + GenomeLocation startTail, stopTail; + AltContig() : name(NULL), accession(NULL), parentAccession(NULL), isRC(false), + start(0), stop(0), parentStart(0), parentStop(0), startTail(0), stopTail(0) {} + }; + + + const char* accessionFastaTag; + typedef std::map<std::string, AltContig> StringAltContigMap; + StringAltContigMap altsByAccession; + typedef std::map<std::string, std::string> StringMap; + StringMap nameToAccession; + StringMap accessionToName; +}; diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 2f34ba82..06fb64d4 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -46,11 +46,12 @@ static const double DEFAULT_SLACK = 0.3; static const unsigned DEFAULT_PADDING = 500; static const unsigned DEFAULT_KEY_BYTES = 4; static const unsigned DEFAULT_LOCATION_SIZE = 4; - +static const char* DEFAULT_ALT_COLUMNS = "ref,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail"; const char *GenomeIndexFileName = "GenomeIndex"; const char *OverflowTableFileName = "OverflowTable"; const char *GenomeIndexHashFileName = "GenomeIndexHash"; const char *GenomeFileName = "Genome"; +const char *LiftedIndexDirName = "Lifted"; static void usage() { @@ -85,12 +86,18 @@ static void usage() " In particular, this will generally use less memory than the index will use once it's built, so if this doesn't work you\n" " won't be able to use the index anyway. However, if you've got sufficient memory to begin with, this option will just\n" " slow down the index build by doing extra, useless IO.\n" + "-altmap file Tab-separated file of alt contig mapping information\n" + "-altcols columns Comma-separated list of columns describing alt mapping file\n" + " Default is v38 %s\n" + "-chrtag tag Tag for chrom name\n" + "-chrmap file Tab-separated file of chrom name and tag values\n" , DEFAULT_SEED_SIZE, DEFAULT_SLACK, DEFAULT_PADDING, DEFAULT_KEY_BYTES, - DEFAULT_LOCATION_SIZE); + DEFAULT_LOCATION_SIZE, + DEFAULT_ALT_COLUMNS); soft_exit_no_print(1); // Don't use soft-exit, it's confusing people to get an error message after the usage } @@ -121,6 +128,10 @@ GenomeIndex::runIndexer( bool large = false; unsigned locationSize = DEFAULT_LOCATION_SIZE; bool smallMemory = false; + const char* altMapFilename = NULL; + const char* altMapColumns = DEFAULT_ALT_COLUMNS; + const char* chrTag = NULL; + const char* chrMapFilename = NULL; for (int n = 2; n < argc; n++) { if (strcmp(argv[n], "-s") == 0) { @@ -172,8 +183,7 @@ GenomeIndex::runIndexer( } } else if (argv[n][0] == '-' && argv[n][1] == 's' && argv[n][2] == 'm') { smallMemory = true; - } - else if (strcmp(argv[n], "-keysize") == 0) { + } else if (strcmp(argv[n], "-keysize") == 0) { if (n + 1 < argc) { keySizeInBytes = atoi(argv[n+1]); if (keySizeInBytes < 4 || keySizeInBytes > 8) { @@ -188,12 +198,47 @@ GenomeIndex::runIndexer( pieceNameTerminatorCharacters = argv[n] + 2; } else if (!strcmp(argv[n], "-bSpace")) { spaceIsAPieceNameTerminator = true; + } else if (!strcmp(argv[n], "-altmap")) { + if (n + 1 < argc) { + altMapFilename = argv[n + 1]; + n++; + } else { + usage(); + } + } else if (!strcmp(argv[n], "-altcols")) { + if (n + 1 < argc) { + altMapColumns = argv[n + 1]; + n++; + } else { + usage(); + } + } else if (!strcmp(argv[n], "-chrtag")) { + if (n + 1 < argc) { + chrTag = argv[n + 1]; + n++; + } + else { + usage(); + } + } else if (!strcmp(argv[n], "-chrmap")) { + if (n + 1 < argc) { + chrMapFilename = argv[n + 1]; + n++; + } + else { + usage(); + } } else { WriteErrorMessage("Invalid argument: %s\n\n", argv[n]); usage(); } } + if (chrMapFilename != NULL && chrTag == NULL) { + WriteErrorMessage("The -chrmap option requires the -chrtag option to be specified\n"); + usage(); + } + if (seedLen < 16 || seedLen > 32) { // Seeds are stored in 64 bits, so they can't be larger than 32 bases for now. WriteErrorMessage("Seed length must be between 16 and 32, inclusive\n"); @@ -223,7 +268,10 @@ GenomeIndex::runIndexer( BigAllocUseHugePages = false; _int64 start = timeInMillis(); - const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding); + + AltContigMap* altMap = altMapFilename != NULL ? AltContigMap::readFromFile(altMapFilename, altMapColumns) : NULL; + + const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, chrTag, chrMapFilename, altMap); if (NULL == genome) { WriteErrorMessage("Unable to read FASTA file\n"); soft_exit(1); @@ -261,12 +309,17 @@ SetInvalidGenomeLocation(unsigned locationSize) bool GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, bool computeBias, const char *directoryName, unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize, - bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory) + bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory, GenomeIndex* unliftedIndex) { PreventMachineHibernationWhileThisThreadIsAlive(); SetInvalidGenomeLocation(locationSize); + if (genome->hasAltContigs() && smallMemory) { + WriteErrorMessage("Warning: Cannot use small memory to build index with alt contigs, ignoring flag\n"); + smallMemory = false; + } + bool buildHistogram = (histogramFileName != NULL); FILE *histogramFile; if (buildHistogram) { @@ -282,22 +335,25 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla return false; } - int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1); + int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), __max(strlen(GenomeFileName), strlen(LiftedIndexDirName))))) + 1); char *filenameBuffer = new char[filenameBufferSize]; - fprintf(stderr,"Saving genome..."); - _int64 start = timeInMillis(); - snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); - if (!genome->saveToFile(filenameBuffer)) { - WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n"); - delete[] filenameBuffer; - return false; + _int64 start; + if (unliftedIndex == NULL) { + fprintf(stderr, "Saving genome..."); + start = timeInMillis(); + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); + if (!genome->saveToFile(filenameBuffer)) { + WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n"); + delete[] filenameBuffer; + return false; + } + fprintf(stderr, "%llds\n", (timeInMillis() + 500 - start) / 1000); } - fprintf(stderr,"%llds\n", (timeInMillis() + 500 - start) / 1000); GenomeIndex *index = new GenomeIndex(); index->genome = NULL; // We always delete the index when we're done, but we delete the genome first to save space during the overflow table build. - + GenomeDistance countOfBases = genome->getCountOfBases(); if (locationSize != 8 && countOfBases > ((_int64) 1 << (locationSize*8)) - 16) { WriteErrorMessage("Genome is too big for %d byte genome locations. Specify a larger location size with -locationSize\n", locationSize); @@ -306,6 +362,9 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // Compute bias table sizes, unless we're using the precomputed ones hardcoded in BiasTables.cpp double *biasTable = NULL; + if (unliftedIndex != NULL) { + computeBias = true; + } if (!computeBias) { if (large) { biasTable = hg19_biasTables_large[hashTableKeySize][seedLen]; @@ -322,7 +381,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla if (computeBias) { unsigned nHashTables = 1 << ((max((unsigned)seedLen, hashTableKeySize * 4) - hashTableKeySize * 4) * 2); biasTable = new double[nHashTables]; - ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large); + ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large, unliftedIndex); } WriteStatusMessage("Allocating memory for hash tables..."); @@ -348,6 +407,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla start = timeInMillis(); volatile _int64 nextOverflowBackpointer = 0; + unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads); + BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads]; + + ExclusiveLock *hashTableLocks = new ExclusiveLock[nHashTables]; + for (unsigned i = 0; i < nHashTables; i++) { + InitializeExclusiveLock(&hashTableLocks[i]); + } + + // lifted index needs to be done in two passes, first to build and then to sort + int liftedIndexPass = 0; + +lifted_index_pass_start: + volatile _int64 nonSeeds = 0; volatile _int64 seedsWithMultipleOccurrences = 0; volatile _int64 genomeLocationsInOverflowTable = 0; // Number of extra hits on duplicate indices. This should come out once we implement the overflow table. @@ -359,14 +431,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla SingleWaiterObject doneObject; CreateSingleWaiterObject(&doneObject); - unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads); - BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads]; - - ExclusiveLock *hashTableLocks = new ExclusiveLock[nHashTables]; - for (unsigned i = 0; i < nHashTables; i++) { - InitializeExclusiveLock(&hashTableLocks[i]); - } - runningThreadCount = nThreads; GenomeDistance nextChunkToProcess = 0; @@ -391,6 +455,12 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } } + index->seedLen = seedLen; + index->hashTableKeySize = hashTableKeySize; + index->largeHashTable = large; + index->locationSize = locationSize; + index->genome = genome; + for (unsigned i = 0; i < nThreads; i++) { threadContexts[i].whichThread = i; threadContexts[i].nThreads = nThreads; @@ -421,6 +491,8 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla threadContexts[i].backpointerSpillLock = &backpointerSpillLock; threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread; threadContexts[i].backpointerSpillFile = backpointerSpillFile; + threadContexts[i].unliftedIndex = unliftedIndex; + threadContexts[i].liftedIndexPass = liftedIndexPass; StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]); } @@ -442,24 +514,28 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // (_int64)hashTables[j]->GetUsedElementCount() * 100 / (_int64)hashTables[j]->GetTableSize()); } - WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n", - seedsWithMultipleOccurrences, - (seedsWithMultipleOccurrences * 100) / countOfBases, - genomeLocationsInOverflowTable, - genomeLocationsInOverflowTable * 100 / countOfBases, - nonSeeds, - (nonSeeds * 100) / countOfBases, - bothComplementsUsed, - noBaseAvailable); - - WriteStatusMessage("Hash table build took %llds\n",(timeInMillis() + 500 - start) / 1000); - + if (unliftedIndex == NULL) { + WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n", + seedsWithMultipleOccurrences, + (seedsWithMultipleOccurrences * 100) / countOfBases, + genomeLocationsInOverflowTable, + genomeLocationsInOverflowTable * 100 / countOfBases, + nonSeeds, + (nonSeeds * 100) / countOfBases, + bothComplementsUsed, + noBaseAvailable); + + WriteStatusMessage("Hash table build took %llds\n", (timeInMillis() + 500 - start) / 1000); + } // // We're done with the raw genome. Delete it to save some memory. // - delete genome; - genome = NULL; + if (!genome->hasAltContigs()) { + // delete if we won't need it later + delete genome; + genome = NULL; + } char *halfBuiltHashTableSpillFileName = NULL; @@ -493,6 +569,23 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000); } + // declare variables before goto + _uint64 nBackpointersProcessed; + _int64 lastPrintTime; + const unsigned maxHistogramEntry = 500000; + _uint64 countOfTooBigForHistogram; + _uint64 sumOfTooBigForHistogram; + _uint64 largestSeed; + unsigned *histogram; + FILE *tablesFile; + size_t totalBytesWritten; + _uint64 overflowTableIndex; + _uint64 duplicateSeedsProcessed; + + if (unliftedIndex != NULL && liftedIndexPass == 1) { + goto lifted_skip_overflow; + } + WriteStatusMessage("Building overflow table.\n"); start = timeInMillis(); fflush(stdout); @@ -520,14 +613,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla soft_exit(1); } - _uint64 nBackpointersProcessed = 0; - _int64 lastPrintTime = timeInMillis(); + nBackpointersProcessed = 0; + lastPrintTime = timeInMillis(); - const unsigned maxHistogramEntry = 500000; - _uint64 countOfTooBigForHistogram = 0; - _uint64 sumOfTooBigForHistogram = 0; - _uint64 largestSeed = 0; - unsigned *histogram = NULL; + countOfTooBigForHistogram = 0; + sumOfTooBigForHistogram = 0; + largestSeed = 0; + histogram = NULL; if (buildHistogram) { histogram = new unsigned[maxHistogramEntry+1]; for (unsigned i = 0; i <= maxHistogramEntry; i++) { @@ -540,15 +632,15 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // Write the hash tables as we go so that we can free their memory on the fly. // snprintf(filenameBuffer,filenameBufferSize,"%s%c%s", directoryName, PATH_SEP, GenomeIndexHashFileName); - FILE *tablesFile = fopen(filenameBuffer, "wb"); + tablesFile = fopen(filenameBuffer, "wb"); if (NULL == tablesFile) { WriteErrorMessage("Unable to open hash table file '%s'\n", filenameBuffer); soft_exit(1); } - size_t totalBytesWritten = 0; - _uint64 overflowTableIndex = 0; - _uint64 duplicateSeedsProcessed = 0; + totalBytesWritten = 0; + overflowTableIndex = 0; + duplicateSeedsProcessed = 0; for (unsigned whichHashTable = 0; whichHashTable < nHashTables; whichHashTable++) { if (NULL == hashTables[whichHashTable]) { @@ -664,14 +756,23 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } totalBytesWritten += bytesWrittenThisHashTable; - delete hashTables[whichHashTable]; - hashTables[whichHashTable] = NULL; + if (genome == NULL || !genome->hasAltContigs()) { + delete hashTables[whichHashTable]; + hashTables[whichHashTable] = NULL; + } } // for each hash table fclose(tablesFile); _ASSERT(overflowTableIndex == index->overflowTableSize); // We used exactly what we expected to use. + if (unliftedIndex != NULL && liftedIndexPass == 0) { + liftedIndexPass = 1; + goto lifted_index_pass_start; + } + +lifted_skip_overflow: + delete overflowAnchor; overflowAnchor = NULL; @@ -687,6 +788,21 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla delete [] histogram; } + if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) { + // create a sub-index with only seeds that occur in alt contigs + // need to build lifted index here because it will reorder unlifted index overflow table + WriteStatusMessage("Creating sub-index for alt contigs...\n"); + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); + bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, + hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); + if (!ok) { + WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); + soft_exit(1); + return false; + } + WriteStatusMessage("...Finished creating sub-index for alt contigs\n"); + } + // // Now save out the part of the index that's independent of the genome itself. // @@ -737,20 +853,24 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla return false; } - fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", GenomeIndexFormatMajorVersion, GenomeIndexFormatMinorVersion, index->nHashTables, + fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", + // NOTE: this must be changed if the format no longer supports v5 (pre-alt) + genome != NULL && genome->hasAltContigs() ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts, + GenomeIndexFormatMinorVersion, index->nHashTables, index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize); fclose(indexFile); + index->genome = NULL; // deleted earlier delete index; if (computeBias && biasTable != NULL) { delete[] biasTable; } - + WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000); delete[] filenameBuffer; - + return true; } @@ -845,7 +965,7 @@ SNAPHashTable** GenomeIndex::allocateHashTables( -GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL) +GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(false), liftedIndex(NULL) { } @@ -885,10 +1005,13 @@ GenomeIndex::~GenomeIndex() delete genome; genome = NULL; + if (NULL != liftedIndex) { + delete liftedIndex; + } } void -GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large) +GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex) /** * Fill in table with the table size biases for a given genome and seed size. * We assume that table is already of the correct size for our seed size @@ -961,12 +1084,12 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables); - - if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { - _uint64 value = 42; - seedsSeen->Insert(seed.getBases(), &value); - numExactSeeds[seed.getHighBases(hashTableKeySize)]++; - } + bool addSeed = unliftedIndex == NULL || unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large); + if (addSeed && NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { + _uint64 value = 42; + seedsSeen->Insert(seed.getBases(), &value); + numExactSeeds[seed.getHighBases(hashTableKeySize)]++; + } } // for (unsigned i = 0; i < nHashTables; i++) printf("Hash table %d is predicted to have %lld entries\n", i, numExactSeeds[i]); @@ -1010,6 +1133,8 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, contexts[i].validSeeds = &validSeeds; contexts[i].approximateCounterLocks = locks; contexts[i].large = large; + contexts[i].unliftedIndex = unliftedIndex; + StartNewThread(ComputeBiasTableWorkerThreadMain, &contexts[i]); } @@ -1114,19 +1239,20 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) _ASSERT(whichHashTable < context->nHashTables); - if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { - PerCounterBatch *batch = &batches[whichHashTable]; - AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - batch->apply(&(*context->approxCounters)[whichHashTable]); - ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + bool addSeed = context->unliftedIndex == NULL || context->unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large); + if (addSeed && batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { + PerCounterBatch *batch = &batches[whichHashTable]; + AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + batch->apply(&(*context->approxCounters)[whichHashTable]); + ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); + _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); - if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds)/printBatchSize) { - WriteStatusMessage("Bias computation: %lld / %lld\n",(basesProcessed/printBatchSize)*printBatchSize, (_int64)countOfBases); - } - unrecordedSkippedSeeds= 0; // We've now recorded them. - } + if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) { + WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases); + } + unrecordedSkippedSeeds = 0; // We've now recorded them. + } } for (unsigned i = 0; i < context->nHashTables; i++) { @@ -1154,7 +1280,41 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) } } - +bool +GenomeIndex::hasAnyAltHitsAndLocationIsFirst( + Seed seed, GenomeLocation genomeLocation, bool large) const +{ + _int64 nHits, nRCHits; + if (doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); +#define HAS_ANY_ALTS \ + bool isFirst = large \ + ? (nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ + (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits)) \ + : nHits > 0 && genomeLocation == *hits; \ + if (isFirst) { \ + for (int i = 0; i < nHits; i++) { \ + if (genome->isAltLocation(hits[i])) { \ + return true; \ + } \ + } \ + for (int i = 0; i < nRCHits; i++) { \ + if (genome->isAltLocation(rcHits[i])) { \ + return true; \ + } \ + } \ + } \ + return false; + HAS_ANY_ALTS + } else { + const unsigned *hits, *rcHits; + lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + HAS_ANY_ALTS + } +} +#undef HAS_ANY_ALTS void GenomeIndex::BuildHashTablesWorkerThreadMain(void *param) @@ -1171,6 +1331,7 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) const Genome *genome = context->genome; unsigned seedLen = context->seedLen; bool large = context->large; + bool lift = context->unliftedIndex != NULL; // // Batch the insertions into the hash tables, because otherwise we spend all of @@ -1200,9 +1361,19 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) continue; } - Seed seed(bases, seedLen); + Seed seed(bases, seedLen); - indexSeed(genomeLocation, seed, batches, context, &stats, large); + if (!lift) { + indexSeed(genomeLocation, seed, batches, context, &stats, large); + } else { + // in the lifted case, we first do one pass to index lifted seeds + // and then another pass to sort the unlifted locations by the lifted locations so they correspond + if (context->liftedIndexPass == 0) { + indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + } else { + resortLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + } + } } // For each genome base in our area // @@ -1224,10 +1395,8 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) } } - -const _int64 GenomeIndex::printPeriod = 100000000; - +const _int64 GenomeIndex::printPeriod = 100000000; void GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) @@ -1241,22 +1410,174 @@ GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBat _ASSERT(whichHashTable < nHashTables); if (batches[whichHashTable].addSeed(genomeLocation, seed.getLowBases(context->hashTableKeySize), usingComplement)) { - AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]); - for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) { - ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation, - batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement, - &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large); - } - ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]); + AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]); + for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) { + ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation, + batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement, + &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large); + } + ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]); - _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds); + _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds); - if ((unsigned)(newNBasesProcessed / printPeriod) > (unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) { - WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases()); - } - stats->unrecordedSkippedSeeds = 0; - batches[whichHashTable].clear(); - } // If we filled a batch + if ((unsigned)(newNBasesProcessed / printPeriod) >(unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) { + WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases()); + } + stats->unrecordedSkippedSeeds = 0; + batches[whichHashTable].clear(); + } // If we filled a batch +} + + void +GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) +{ + // if this is first occurrence of seed in unlifted index, checks if seed is in any alts + // and if so, adds all locations to this index, lifting alts to non-alt locations + + _int64 nHits, nRCHits; + if (doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); +#define CHECK_ALTS_AND_ADD_LIFTED \ + if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ + (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ + bool anyAlts = false; \ + for (int i = 0; i < nHits && !anyAlts; i++) { \ + anyAlts = genome->isAltLocation(hits[i]); \ + } \ + for (int i = 0; i < nRCHits && !anyAlts; i++) { \ + anyAlts = genome->isAltLocation(rcHits[i]); \ + } \ + if (anyAlts) { \ + for (int i = 0; i < nHits; i++) { \ + indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); \ + } \ + if (!seed.isOwnReverseComplement()) { \ + Seed rcSeed = ~seed; \ + for (int i = 0; i < nRCHits; i++) { \ + indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); \ + } \ + } \ + } \ + } + CHECK_ALTS_AND_ADD_LIFTED + } + else { + const unsigned *hits, *rcHits; + context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + CHECK_ALTS_AND_ADD_LIFTED + } +} +#undef CHECK_ALTS_AND_ADD_LIFTED + + void +dualBackwardsSort32( + _int64 n, + unsigned* keys, + unsigned* values) +{ + // todo: optimize sorting, just using a simple selection sort for now + unsigned t; +#define DUAL_BACKWARDS_SORT \ + if (n < 2) { \ + return; \ + } \ + for (_int64 i = 0; i < n - 1; i++) { \ + for (_int64 j = n - 1; j > i; j--) { \ + if (keys[i] < keys[j]) { \ + t = keys[i]; \ + keys[i] = keys[j]; \ + keys[j] = t; \ + t = values[i]; \ + values[i] = values[j]; \ + values[j] = t; \ + } \ + } \ + } + DUAL_BACKWARDS_SORT +} + + void +dualBackwardsSort( + _int64 n, + GenomeLocation* keys, + GenomeLocation* values) +{ + GenomeLocation t; + DUAL_BACKWARDS_SORT +} +#undef DUAL_BACKWARDS_SORT + + void +GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) +{ + // redo lifting and then sort both lists by lifted location + // NOTE: this leaves the unlifted list in a non-sorted order + + _int64 nHits, nRCHits; + _int64 nLiftedHits, nLiftedRCHits; + if (doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + const GenomeLocation *liftedHits, *liftedRCHits; + GenomeLocation liftedSingleHit[2], liftedSingleRCHit[2]; + lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, &liftedSingleHit[1], &liftedSingleRCHit[1]); + if (nLiftedHits > 1 || nLiftedRCHits > 1) { + context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); + _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); + if ((nHits > 1 && genomeLocation == hits[0]) || (nRCHits > 1 && genomeLocation == rcHits[0])) { + // re-lift unlifted so that the order corresponds, then sort both by lifted location + for (int i = 0; i < nHits; i++) { + ((GenomeLocation*)liftedHits)[i] = genome->getLiftedLocation(hits[i]); + } + for (int i = 0; i < nRCHits; i++) { + ((GenomeLocation*)liftedRCHits)[i] = genome->getLiftedLocation(rcHits[i]); + } + dualBackwardsSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits); + dualBackwardsSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits); +#ifdef _DEBUG + for (int i = 0; i < nHits; i++) { + _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]); + _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]); + } + for (int i = 0; i < nRCHits; i++) { + _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]); + _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]); + } +#endif + } + } + } else { + const unsigned *hits, *rcHits; + const unsigned *liftedHits, *liftedRCHits; + lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits); + if (nLiftedHits > 1 || nLiftedRCHits > 1) { + context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); + if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { + // re-lift unlifted so that the order corresponds, then sort both by lifted location + for (int i = 0; i < nHits; i++) { + ((unsigned*)liftedHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(hits[i])); + } + for (int i = 0; i < nRCHits; i++) { + ((unsigned*)liftedRCHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(rcHits[i])); + } + dualBackwardsSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits); + dualBackwardsSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits); +#ifdef _DEBUG + for (int i = 0; i < nHits; i++) { + _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]); + _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]); + } + for (int i = 0; i < nRCHits; i++) { + _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]); + _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]); + } +#endif + } + } + } } void @@ -1606,7 +1927,7 @@ GenomeIndex::printBiasTables() } GenomeIndex * -GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) +GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex) { int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1); char *filenameBuffer = new char[filenameBufferSize]; @@ -1649,7 +1970,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) indexFile->close(); delete indexFile; - if (majorVersion != GenomeIndexFormatMajorVersion) { + if (majorVersion != GenomeIndexFormatMajorVersion && majorVersion != GenomeIndexFormatMajorVersionWithoutAlts) { WriteErrorMessage("This genome index appears to be from a different version of SNAP than this, and so we can't read it. Index version %d, SNAP index format version %d\n", majorVersion, GenomeIndexFormatMajorVersion); soft_exit(1); @@ -1829,22 +2150,40 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) blobFile = NULL; } - snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); - if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { - WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); - delete[] filenameBuffer; - delete index; - return NULL; - } + if (!liftedIndex) { + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); + if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { + WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); + delete[] filenameBuffer; + delete index; + return NULL; + } - if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) { - WriteErrorMessage("\nThis index has too many overflow entries to be valid. Some early versions of SNAP\n" - "allowed building indices with too small of a seed size, and this appears to be such\n" - "an index. You can no longer build indices like this, and you also can't use them\n" - "because they are corrupt and would produce incorrect results. Please use an index\n" - "built with a larger seed size. For hg19, the seed size must be at least 19.\n" - "For other reference genomes this quantity will vary.\n"); - soft_exit(1); + if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) { + WriteErrorMessage("\nThis index has too many overflow entries to be valid. Some early versions of SNAP\n" + "allowed building indices with too small of a seed size, and this appears to be such\n" + "an index. You can no longer build indices like this, and you also can't use them\n" + "because they are corrupt and would produce incorrect results. Please use an index\n" + "built with a larger seed size. For hg19, the seed size must be at least 19.\n" + "For other reference genomes this quantity will vary.\n"); + soft_exit(1); + } + + if (index->genome->hasAltContigs()) { + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); + index->liftedIndex = loadFromDirectory(filenameBuffer, map, prefetch, true); + if (index->liftedIndex == NULL) { + WriteErrorMessage("Missing alt index directory %s\n", filenameBuffer); + soft_exit(1); + } + index->liftedIndex->genome = index->genome; + index->hasAlts = true; + } else { + index->liftedIndex = NULL; + } + } else { + index->hasAlts = true; + index->genome = NULL; } delete[] filenameBuffer; @@ -1857,7 +2196,7 @@ GenomeIndex::lookupSeed32( _int64 *nHits, const unsigned **hits, _int64 *nRCHits, - const unsigned **rcHits) + const unsigned **rcHits) const { _ASSERT(locationSize == 4); // This is the caller's responsibility to check. @@ -1915,11 +2254,37 @@ GenomeIndex::lookupSeed32( } } + + void +GenomeIndex::lookupSeedAlt32( + Seed seed, + _int64 *nHits, + const unsigned **hits, + _int64 *nRCHits, + const unsigned **rcHits, + const unsigned **unliftedHits, + const unsigned **unliftedRCHits) +{ + lookupSeed32(seed, nHits, hits, nRCHits, rcHits); + *unliftedHits = *hits; + *unliftedRCHits = *rcHits; + if (hasAlts) { + _int64 nLiftedHits, nLiftedRCHits; + const unsigned *liftedHits, *liftedRCHits; + liftedIndex->lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits); + if (nLiftedHits != 0 || nLiftedRCHits != 0) { + _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits); + *hits = liftedHits; + *rcHits = liftedRCHits; + } + } +} + void GenomeIndex::fillInLookedUpResults32( const unsigned *subEntry, _int64 *nHits, - const unsigned **hits) + const unsigned **hits) const { // // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at @@ -1968,7 +2333,7 @@ GenomeIndex::lookupSeed( _int64 * nRCHits, const GenomeLocation ** rcHits, GenomeLocation * singleHit, - GenomeLocation * singleRCHit) + GenomeLocation * singleRCHit) const { _ASSERT(locationSize > 4 && locationSize <= 8); @@ -2041,9 +2406,36 @@ GenomeIndex::lookupSeed( } } + void +GenomeIndex::lookupSeedAlt( + Seed seed, + _int64 * nHits, + const GenomeLocation ** hits, + _int64 * nRCHits, + const GenomeLocation ** rcHits, + const GenomeLocation ** unliftedHits, + const GenomeLocation ** unliftedRCHits, + GenomeLocation * singleHit, + GenomeLocation * singleRCHit) +{ + // todo: implement + lookupSeed(seed, nHits, hits, nRCHits, rcHits, singleHit, singleRCHit); + *unliftedHits = *hits; + *unliftedRCHits = *rcHits; + if (hasAlts) { + _int64 nLiftedHits, nLiftedRCHits; + const GenomeLocation *liftedHits, *liftedRCHits; + liftedIndex->lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, singleHit + 1, singleRCHit + 1); + if (nLiftedHits != 0 || nLiftedRCHits != 0) { + _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits); + *hits = liftedHits; + *rcHits = liftedRCHits; + } + } +} void -GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) +GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const { // // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index 21654618..86a28c4f 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -47,8 +47,15 @@ class GenomeIndex { // be pointed to as a return value. When only a single hit is returned, *hits == singleHit, so there's // no need to check on the caller's side. // - void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit); - void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits); + void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit) const; + void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits) const; + + // versions for genome that has alt regions + // hits/rcHits locations are lifted to non-alt contigs, unliftedHits/unliftedRCHits are original locations in alt contigs + // nHits/nRCHits is the same for both sets + // *hits==*unliftedHits && *rcHits==*unliftedRCHits iff seed has no alt hits + void lookupSeedAlt(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, const GenomeLocation **unliftedHits, const GenomeLocation **unliftedRCHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit); + void lookupSeedAlt32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits, const unsigned **unliftedHits, const unsigned **unliftedRCHits); bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;} @@ -75,7 +82,7 @@ class GenomeIndex { // static void runIndexer(int argc, const char **argv); - static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch); + static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex = false); static void printBiasTables(); @@ -86,6 +93,12 @@ class GenomeIndex { unsigned nHashTables; const Genome *genome; + // TRUE if genome has alt contigs + bool hasAlts; + + // secondary index for all seeds that map to alt contigs with locations lifted to non-alt contigs + GenomeIndex* liftedIndex; + bool largeHashTable; unsigned locationSize; @@ -147,12 +160,13 @@ class GenomeIndex { // Build a genome index and write it to a directory. If you don't already have a saved index // the only way to get one is to build it into a directory and then load it from the directory. // NB: This deletes the Genome that's passed into it. + // unliftedIndex is an internal parameter used to build 2-level index for genomes with alt contigs // static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, - bool computeBias, const char *directory, + bool computeBias, const char *directoryName, unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize, bool large, const char *histogramFileName, - unsigned locationSize, bool smallMemory); + unsigned locationSize, bool smallMemory, GenomeIndex *unliftedIndex = NULL); // @@ -161,15 +175,17 @@ class GenomeIndex { static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack, int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL); - static const unsigned GenomeIndexFormatMajorVersion = 5; + static const unsigned GenomeIndexFormatMajorVersion = 6; static const unsigned GenomeIndexFormatMinorVersion = 0; - + // NOTE: this must be changed if the format no longer supports v5 (pre-alt) + static const unsigned GenomeIndexFormatMajorVersionWithoutAlts = 5; + static const unsigned largestBiasTable = 32; // Can't be bigger than the biggest seed size, which is set in Seed.h. Bigger than 32 means a new Seed structure. static const unsigned largestKeySize = 8; static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1]; static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1]; - static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large); + static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex = NULL); struct ComputeBiasTableThreadContext { SingleWaiterObject *doneObject; @@ -184,12 +200,15 @@ class GenomeIndex { unsigned seedLen; volatile _int64 *validSeeds; bool large; + const GenomeIndex *unliftedIndex; ExclusiveLock *approximateCounterLocks; }; static void ComputeBiasTableWorkerThreadMain(void *param); + bool hasAnyAltHitsAndLocationIsFirst(Seed seed, GenomeLocation genomeLocation, bool large) const; + struct OverflowBackpointer; struct BuildHashTablesThreadContext { @@ -226,6 +245,10 @@ class GenomeIndex { ExclusiveLock *backpointerSpillLock; FILE *backpointerSpillFile; + // used for building sub-index of only seeds that occur in alt contigs + GenomeIndex *unliftedIndex; + int liftedIndexPass; + ExclusiveLock *hashTableLocks; ExclusiveLock *overflowTableLock; }; @@ -274,8 +297,10 @@ class GenomeIndex { static const _int64 printPeriod; virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); + virtual void indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); + virtual void resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); - + static void BuildHashTablesWorkerThreadMain(void *param); void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context); static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement, @@ -294,6 +319,6 @@ class GenomeIndex { BuildHashTablesThreadContext*context, GenomeLocation genomeLocation); - void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits); - void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation); + void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits) const; + void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const; }; diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index be94b4ec..1c5087df 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -33,6 +33,9 @@ Revision History: extern bool _DumpAlignments; // From BaseAligner.cpp #endif // _DEBUG +static const double EPSILON_FACTOR_HI = 1.0000000001; +static const double EPSILON_FACTOR_LO = 0.9999999999; + IntersectingPairedEndAligner::IntersectingPairedEndAligner( GenomeIndex *index_, unsigned maxReadSize_, @@ -56,6 +59,7 @@ IntersectingPairedEndAligner::IntersectingPairedEndAligner( maxSecondaryAlignmentsPerContig(maxSecondaryAlignmentsPerContig_) { doesGenomeIndexHave64BitLocations = index->doesGenomeIndexHave64BitLocations(); + doesGenomeIndexHaveAlts = index->getGenome()->hasAltContigs(); unsigned maxSeedsToUse; if (0 != numSeedsFromCommandLine) { @@ -182,7 +186,7 @@ IntersectingPairedEndAligner::align( #ifdef _DEBUG if (_DumpAlignments) { - printf("\nIntersectingAligner aligning reads '%*.s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData()); + printf("\nIntersectingAligner aligning reads '%.*s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData()); } #endif // _DEBUG @@ -201,6 +205,7 @@ IntersectingPairedEndAligner::align( GenomeLocation bestResultGenomeLocation[NUM_READS_PER_PAIR]; Direction bestResultDirection[NUM_READS_PER_PAIR]; unsigned bestResultScore[NUM_READS_PER_PAIR]; + bool bestPairHasAlts = false; unsigned popularSeedsSkipped[NUM_READS_PER_PAIR]; reads[0][FORWARD] = read0; @@ -335,12 +340,23 @@ IntersectingPairedEndAligner::align( _int64 nHits[NUM_DIRECTIONS]; const GenomeLocation *hits[NUM_DIRECTIONS]; const unsigned *hits32[NUM_DIRECTIONS]; + const GenomeLocation *unliftedHits[NUM_DIRECTIONS]; + const unsigned *unliftedHits32[NUM_DIRECTIONS]; - if (doesGenomeIndexHave64BitLocations) { - index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], - hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); + if (!doesGenomeIndexHaveAlts) { + if (doesGenomeIndexHave64BitLocations) { + index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], + hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); + } else { + index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]); + } } else { - index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]); + if (doesGenomeIndexHave64BitLocations) { + index->lookupSeedAlt(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &unliftedHits[FORWARD], &unliftedHits[RC], + hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); + } else { + index->lookupSeedAlt32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC], &unliftedHits32[FORWARD], &unliftedHits32[RC]); + } } countOfHashTableLookups[whichRead]++; @@ -353,10 +369,18 @@ IntersectingPairedEndAligner::align( } if (nHits[dir] < maxBigHits) { totalHashTableHits[whichRead][dir] += nHits[dir]; - if (doesGenomeIndexHave64BitLocations) { - hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], beginsDisjointHitSet[dir]); + if (!doesGenomeIndexHaveAlts) { + if (doesGenomeIndexHave64BitLocations) { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], NULL, beginsDisjointHitSet[dir]); + } else { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], NULL, beginsDisjointHitSet[dir]); + } } else { - hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], beginsDisjointHitSet[dir]); + if (doesGenomeIndexHave64BitLocations) { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], unliftedHits[dir], beginsDisjointHitSet[dir]); + } else { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], unliftedHits32[dir], beginsDisjointHitSet[dir]); + } } beginsDisjointHitSet[dir]= false; } else { @@ -388,7 +412,6 @@ IntersectingPairedEndAligner::align( Direction setPairDirection[NUM_SET_PAIRS][NUM_READS_PER_PAIR] = {{FORWARD, RC}, {RC, FORWARD}}; - // // Phase 2: find all possible candidates and add them to candidate lists (for the reads with fewer and more hits). // @@ -409,6 +432,10 @@ IntersectingPairedEndAligner::align( unsigned lastSeedOffsetForReadWithFewerHits; GenomeLocation lastGenomeLocationForReadWithFewerHits; GenomeLocation lastGenomeLocationForReadWithMoreHits; + GenomeLocation lastUnliftedGenomeLocationForReadWithFewerHits; + GenomeLocation lastUnliftedGenomeLocationForReadWithMoreHits; + GenomeLocation *pLastUnliftedGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL; + GenomeLocation *pLastUnliftedGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL; unsigned lastSeedOffsetForReadWithMoreHits; bool outOfMoreHitsLocations = false; @@ -416,13 +443,13 @@ IntersectingPairedEndAligner::align( // // Seed the intersection state by doing a first lookup. // - if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) { + if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { // // No hits in this direction. // continue; // The outer loop over set pairs. } - + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); lastGenomeLocationForReadWithMoreHits = InvalidGenomeLocation; // @@ -444,9 +471,10 @@ IntersectingPairedEndAligner::align( // location that's not too high. // if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing, - &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) { + &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) { break; // End of all of the mates. We're done with this set pair. } + _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits); } if ((lastGenomeLocationForReadWithMoreHits + maxSpacing < lastGenomeLocationForReadWithFewerHits || outOfMoreHitsLocations) && @@ -463,12 +491,13 @@ IntersectingPairedEndAligner::align( } if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits, - &lastSeedOffsetForReadWithFewerHits)) { + &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { // // No more candidates on the read with fewer hits side. We're done with this set pair. // break; } + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); continue; } @@ -490,12 +519,13 @@ IntersectingPairedEndAligner::align( soft_exit(1); } scoringMateCandidates[whichSetPair][lowestFreeScoringMateCandidate[whichSetPair]].init( - lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits); + lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits, + doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithMoreHits : lastGenomeLocationForReadWithMoreHits); #ifdef _DEBUG if (_DumpAlignments) { - printf("SetPair %d, added more hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n", - whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits, + printf("SetPair %d, added more hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n", + whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits, lastUnliftedGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits); } @@ -504,12 +534,12 @@ IntersectingPairedEndAligner::align( lowestFreeScoringMateCandidate[whichSetPair]++; previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits; - - if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) { + if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) { lastGenomeLocationForReadWithMoreHits = 0; outOfMoreHitsLocations = true; break; // out of the loop looking for candidates on the more hits side. } + _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits); } // @@ -550,15 +580,16 @@ IntersectingPairedEndAligner::align( scoringCandidatePool[lowestFreeScoringCandidatePoolEntry].init(lastGenomeLocationForReadWithFewerHits, whichSetPair, lowestFreeScoringMateCandidate[whichSetPair] - 1, lastSeedOffsetForReadWithFewerHits, bestPossibleScoreForReadWithFewerHits, - scoringCandidates[bestPossibleScore]); + scoringCandidates[bestPossibleScore], + doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithFewerHits : lastGenomeLocationForReadWithFewerHits); scoringCandidates[bestPossibleScore] = &scoringCandidatePool[lowestFreeScoringCandidatePoolEntry]; #ifdef _DEBUG if (_DumpAlignments) { - printf("SetPair %d, added fewer hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n", - whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits, + printf("SetPair %d, added fewer hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n", + whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits, lastUnliftedGenomeLocationForReadWithFewerHits, lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits, lastSeedOffsetForReadWithFewerHits); } @@ -568,9 +599,10 @@ IntersectingPairedEndAligner::align( maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore); } - if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) { + if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { break; } + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); } } // For each set pair @@ -602,15 +634,15 @@ IntersectingPairedEndAligner::align( double fewerEndMatchProbability; int fewerEndGenomeLocationOffset; - scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsGenomeLocation, + scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset); _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); #ifdef _DEBUG if (_DumpAlignments) { - printf("Scored fewer end candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool), - candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->seedOffset, + printf("Scored fewer end candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool), + candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, fewerEndScore, fewerEndGenomeLocationOffset); } #endif // DEBUG @@ -635,13 +667,13 @@ IntersectingPairedEndAligner::align( // use now, score it. // if (mate->score == -2 || mate->score == -1 && mate->scoreLimit < scoreLimit - fewerEndScore) { - scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsGenomeLocation, + scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsUnliftedGenomeLocation, mate->seedOffset, scoreLimit - fewerEndScore, &mate->score, &mate->matchProbability, &mate->genomeOffset); #ifdef _DEBUG if (_DumpAlignments) { - printf("Scored mate candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", - (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation, + printf("Scored mate candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n", + (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation, mate->readWithMoreHitsUnliftedGenomeLocation, mate->seedOffset, scoreLimit - fewerEndScore, mate->score, mate->genomeOffset); } #endif // _DEBUG @@ -654,11 +686,22 @@ IntersectingPairedEndAligner::align( if (mate->score != -1) { double pairProbability = mate->matchProbability * fewerEndMatchProbability; unsigned pairScore = mate->score + fewerEndScore; + + // reduce probability of pairs matching across different overlapping alts + // todo: assuming if they're on different alts within maxSpacing they overlap - true for GRCh38 but not necessarily for all genomes + // use crossover probability with 1 centiMorgan ~= 1Mbp - too strict? + if (doesGenomeIndexHaveAlts && candidate->isAlt() && mate->isAlt() && + DistanceBetweenGenomeLocations(mate->readWithMoreHitsUnliftedGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) + { + pairProbability *= 1e-8 * DistanceBetweenGenomeLocations(candidate->readWithFewerHitsGenomeLocation, mate->readWithMoreHitsGenomeLocation); + } + // // See if this should be ignored as a merge, or if we need to back out a previously scored location // because it's a worse version of this location. // MergeAnchor *mergeAnchor = candidate->mergeAnchor; + MergeAnchor *unliftedMergeAnchor = candidate->unliftedMergeAnchor; if (NULL == mergeAnchor) { // @@ -672,6 +715,7 @@ IntersectingPairedEndAligner::align( if (mergeCandidate->mergeAnchor != NULL) { candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor; + candidate->unliftedMergeAnchor = mergeAnchor = mergeCandidate->unliftedMergeAnchor; break; } } @@ -681,10 +725,11 @@ IntersectingPairedEndAligner::align( mergeCandidate < scoringCandidatePool + lowestFreeScoringCandidatePoolEntry && genomeLocationIsWithin(mergeCandidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, 50) && mergeCandidate->whichSetPair == candidate->whichSetPair; - mergeCandidate--) { + mergeCandidate++) { if (mergeCandidate->mergeAnchor != NULL) { candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor; + candidate->unliftedMergeAnchor = unliftedMergeAnchor = mergeCandidate->unliftedMergeAnchor; break; } } @@ -692,11 +737,12 @@ IntersectingPairedEndAligner::align( } bool merged; + bool mergedUnlifted; double oldPairProbability; if (NULL == mergeAnchor) { - if (firstFreeMergeAnchor >= mergeAnchorPoolSize) { + if (firstFreeMergeAnchor >= mergeAnchorPoolSize - doesGenomeIndexHaveAlts) { WriteErrorMessage("Ran out of merge anchor pool entries. Perhaps rerunning with a larger value of -mcp will help\n"); soft_exit(1); } @@ -709,29 +755,45 @@ IntersectingPairedEndAligner::align( pairProbability, pairScore); merged = false; + mergedUnlifted = false; oldPairProbability = 0; candidate->mergeAnchor = mergeAnchor; + if (doesGenomeIndexHaveAlts) { + unliftedMergeAnchor = &mergeAnchorPool[firstFreeMergeAnchor]; + candidate->unliftedMergeAnchor = unliftedMergeAnchor; + firstFreeMergeAnchor++; + unliftedMergeAnchor->init(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, + pairProbability, pairScore); + } } else { merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, - pairProbability, pairScore, &oldPairProbability); + pairProbability, pairScore, doesGenomeIndexHaveAlts && (! candidate->isAlt()) && (!mate->isAlt()), &oldPairProbability); + if (unliftedMergeAnchor != NULL) { + double ignore; + mergedUnlifted = merged && unliftedMergeAnchor->checkMerge(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, + pairProbability, pairScore, false, &ignore); + } } - if (!merged) { + if (!(merged && mergedUnlifted)) { // // Back out the probability of the old match that we're merged with, if any. The max // is necessary because a + b - b is not necessarily a in floating point. If there // was no merge, the oldPairProbability is 0. // - probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability); - + if (!merged) { + probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability); + } bool isBestHit = false; if (pairScore <= maxK && (pairScore < bestPairScore || - (pairScore == bestPairScore && pairProbability > probabilityOfBestPair))) { + (pairScore == bestPairScore && (pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_HI || + (bestPairHasAlts && pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_LO && (!candidate->isAlt()) && (!mate->isAlt())))))) { // // A new best hit. // - if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= pairScore - bestPairScore) { + // Code review note: was pairScore-bestPairScore which is negative int, i.e. very large unsigned, so would only save secondary w/equal score + if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= bestPairScore - pairScore) { // // Move the old best to be a secondary alignment. This won't happen on the first time we get a valid alignment, // because bestPairScore is initialized to be very large. @@ -759,12 +821,13 @@ IntersectingPairedEndAligner::align( } bestPairScore = pairScore; probabilityOfBestPair = pairProbability; - bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset; - bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset; + bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset; + bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset; bestResultScore[readWithFewerHits] = fewerEndScore; bestResultScore[readWithMoreHits] = mate->score; bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits]; bestResultDirection[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits]; + bestPairHasAlts = candidate->isAlt() || mate->isAlt(); if (!noUkkonen) { scoreLimit = bestPairScore + extraSearchDepth; @@ -786,8 +849,8 @@ IntersectingPairedEndAligner::align( result->direction[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits]; result->direction[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits]; result->fromAlignTogether = true; - result->location[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset; - result->location[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset; + result->location[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset; + result->location[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset; result->mapq[0] = result->mapq[1] = 0; result->score[readWithMoreHits] = mate->score; result->score[readWithFewerHits] = fewerEndScore; @@ -797,12 +860,15 @@ IntersectingPairedEndAligner::align( } } - probabilityOfAllPairs += pairProbability; + if (!merged) { + probabilityOfAllPairs += pairProbability; + } #ifdef _DEBUG if (_DumpAlignments) { - printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n", + printf("Added %e (= %e * %e) @ (%u, %u)((%u, %u)), giving new probability of all pairs %e, score %d = %d + %d%s\n", pairProbability, mate->matchProbability , fewerEndMatchProbability, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, + candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsUnliftedGenomeLocation+ mate->genomeOffset, probabilityOfAllPairs, pairScore, fewerEndScore, mate->score, isBestHit ? " New best hit" : ""); } @@ -1091,7 +1157,7 @@ IntersectingPairedEndAligner::HashTableHitSet::init() #define RL(lookups, glType, lookupListHead) \ void \ -IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, bool beginsDisjointHitSet) \ +IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, const glType *unliftedHits, bool beginsDisjointHitSet) \ { \ _ASSERT(nLookupsUsed < maxSeeds); \ if (beginsDisjointHitSet) { \ @@ -1106,6 +1172,7 @@ IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _ASSERT(currentDisjointHitSet != -1); /* Essentially that beginsDisjointHitSet is set for the first recordLookup call */ \ lookups[nLookupsUsed].currentHitForIntersection = 0; \ lookups[nLookupsUsed].hits = hits; \ + lookups[nLookupsUsed].unliftedHits = unliftedHits; \ lookups[nLookupsUsed].nHits = nHits; \ lookups[nLookupsUsed].seedOffset = seedOffset; \ lookups[nLookupsUsed].whichDisjointHitSet = currentDisjointHitSet; \ @@ -1181,7 +1248,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren } bool -IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound) + IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound) { bool anyFound = false; @@ -1235,10 +1302,21 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom unsigned clause2 = probe == 0; if (clause1 && (clause2 || probeMinusOneHit > maxGenomeLocationToFindThisSeed)) { - if (probeHit - seedOffset > bestLocationFound) { - anyFound = true; - mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; - *seedOffsetFound = seedOffset; + if (actualUnliftedGenomeLocationFound == NULL) { + if (probeHit - seedOffset > bestLocationFound) { + anyFound = true; + mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; + *seedOffsetFound = seedOffset; + } + } else { + GenomeLocation bestUnliftedLocationFound = doesGenomeIndexHave64BitLocations ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]; + if (probeHit - seedOffset > bestLocationFound || + (probeHit - seedOffset == bestLocationFound && *actualUnliftedGenomeLocationFound != bestUnliftedLocationFound)) { + anyFound = true; + mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; + *actualUnliftedGenomeLocationFound = bestUnliftedLocationFound - seedOffset; + *seedOffsetFound = seedOffset; + } } if (doesGenomeIndexHave64BitLocations) { @@ -1273,7 +1351,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom bool -IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound) + IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) { bool anyFound = false; *genomeLocation = 0; @@ -1286,6 +1364,9 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom for (unsigned i = 0; i < nLookupsUsed; i++) { \ if (lookups[i].nHits > 0 && lookups[i].hits[0] - lookups[i].seedOffset > GenomeLocationAsInt64(*genomeLocation)) { \ mostRecentLocationReturned = *genomeLocation = lookups[i].hits[0] - lookups[i].seedOffset; \ + if (unliftedGenomeLocation != NULL) { \ + *unliftedGenomeLocation = lookups[i].unliftedHits[0] - lookups[i].seedOffset; \ + } \ *seedOffsetFound = lookups[i].seedOffset; \ anyFound = true; \ } \ @@ -1303,7 +1384,8 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom } bool -IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound) +IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit( + GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) { // // Look through all of the lookups and find the one with the highest location smaller than the current one. @@ -1318,7 +1400,8 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g for (unsigned i = 0; i < nLookupsUsed; i++) { _int64 *currentHitForIntersection; _int64 nHits; - GenomeLocation hitLocation; + GenomeLocation hitLocation = -1; + GenomeLocation unliftedHitLocation = -2; unsigned seedOffset; // @@ -1330,6 +1413,9 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g seedOffset = lookups[i].seedOffset; \ if (nHits != *currentHitForIntersection) { \ hitLocation = lookups[i].hits[*currentHitForIntersection]; \ + if (unliftedGenomeLocation != NULL) { \ + unliftedHitLocation = lookups[i].unliftedHits[*currentHitForIntersection]; \ + } \ } @@ -1349,8 +1435,14 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g } if (doesGenomeIndexHave64BitLocations) { hitLocation = lookups64[i].hits[*currentHitForIntersection]; + if (unliftedGenomeLocation != NULL) { + unliftedHitLocation = lookups64[i].unliftedHits[*currentHitForIntersection]; + } } else { hitLocation = lookups32[i].hits[*currentHitForIntersection]; + if (unliftedGenomeLocation != NULL) { + unliftedHitLocation = lookups32[i].unliftedHits[*currentHitForIntersection]; + } } } @@ -1359,6 +1451,9 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome { *genomeLocation = foundLocation = hitLocation - seedOffset; + if (unliftedGenomeLocation != NULL) { + *unliftedGenomeLocation = unliftedHitLocation - seedOffset; + } *seedOffsetFound = seedOffset; anyFound = true; } @@ -1373,7 +1468,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g } bool -IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, +IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt, double *oldMatchProbability) { if (locationForReadWithMoreHits == InvalidGenomeLocation || !doesRangeMatch(newMoreHitLocation, newFewerHitLocation)) { @@ -1385,12 +1480,20 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL matchProbability = newMatchProbability; pairScore = newPairScore; *oldMatchProbability = 0.0; +#ifdef _DEBUG + if (_DumpAlignments) { + printf("New anchor loc (%u, %u)\n", newMoreHitLocation, newFewerHitLocation); + } +#endif return false; } else { // // Within merge distance. Keep the better score (or if they're tied the better match probability). // - if (newPairScore < pairScore || newPairScore == pairScore && newMatchProbability > matchProbability) { + if (newPairScore < pairScore || (newPairScore == pairScore && + (newMatchProbability >= matchProbability*EPSILON_FACTOR_HI || + (newMatchProbability >= matchProbability*EPSILON_FACTOR_LO && newPairIsNonAlt && + (newMoreHitLocation != locationForReadWithMoreHits || newFewerHitLocation != locationForReadWithFewerHits))))) { #ifdef _DEBUG if (_DumpAlignments) { printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n", diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h index 9bc6029e..3884892d 100644 --- a/SNAPLib/IntersectingPairedEndAligner.h +++ b/SNAPLib/IntersectingPairedEndAligner.h @@ -134,6 +134,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner unsigned maxSpacing; unsigned seedLen; bool doesGenomeIndexHave64BitLocations; + bool doesGenomeIndexHaveAlts; _int64 nLocationsScored; bool noUkkonen; bool noOrderedEvaluation; @@ -149,6 +150,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner unsigned seedOffset; _int64 nHits; const GL * hits; + const GL * unliftedHits; unsigned whichDisjointHitSet; // @@ -185,7 +187,8 @@ class IntersectingPairedEndAligner : public PairedEndAligner // provide the lookup function a place to write the result. Since we need one per // lookup, it goes here. // - GL singletonGenomeLocation[2]; // The [2] is because we need to look one before sometimes, and that allows space + GL singletonGenomeLocation[4]; // The [4] is because we need to look one before sometimes, and that allows space + // also to allow space for unlifted locations }; // @@ -211,26 +214,25 @@ class IntersectingPairedEndAligner : public PairedEndAligner // seed for it not to hit, and since the reads are disjoint there can't be a case // where the same difference caused two seeds to miss). // - void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, bool beginsDisjointHitSet); - void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, bool beginsDisjointHitSet); + void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, const unsigned *unliftedHits, bool beginsDisjointHitSet); + void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, const GenomeLocation *unliftedHits, bool beginsDisjointHitSet); // // This efficiently works through the set looking for the next hit at or below this address. // A HashTableHitSet only allows a single iteration through its address space per call to // init(). // - bool getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound); + bool getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound); // // Walk down just one step, don't binary search. // - bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound); - + bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation); // // Find the highest genome address. // - bool getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound); + bool getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation); unsigned computeBestPossibleScoreForCurrentHit(); @@ -377,7 +379,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner // // Returns true and sets oldMatchProbability if this should be eliminated due to a match. // - bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, + bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt, double *oldMatchProbability); }; @@ -394,14 +396,17 @@ class IntersectingPairedEndAligner : public PairedEndAligner // double matchProbability; GenomeLocation readWithMoreHitsGenomeLocation; + GenomeLocation readWithMoreHitsUnliftedGenomeLocation; unsigned bestPossibleScore; unsigned score; unsigned scoreLimit; // The scoreLimit with which score was computed unsigned seedOffset; int genomeOffset; - void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_) { + void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_, GenomeLocation readWithMoreHitsUnliftedGenomeLocation_) { readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_; + readWithMoreHitsUnliftedGenomeLocation = readWithMoreHitsUnliftedGenomeLocation_; + _ASSERT(readWithMoreHitsUnliftedGenomeLocation != -1); bestPossibleScore = bestPossibleScore_; seedOffset = seedOffset_; score = -2; @@ -409,22 +414,26 @@ class IntersectingPairedEndAligner : public PairedEndAligner matchProbability = 0; genomeOffset = 0; } + bool isAlt() const { return readWithMoreHitsGenomeLocation != readWithMoreHitsUnliftedGenomeLocation; } }; struct ScoringCandidate { ScoringCandidate * scoreListNext; // This is a singly-linked list MergeAnchor * mergeAnchor; + MergeAnchor * unliftedMergeAnchor; unsigned scoringMateCandidateIndex; // Index into the array of scoring mate candidates where we should look GenomeLocation readWithFewerHitsGenomeLocation; + GenomeLocation readWithFewerHitsUnliftedGenomeLocation; unsigned whichSetPair; unsigned seedOffset; unsigned bestPossibleScore; void init(GenomeLocation readWithFewerHitsGenomeLocation_, unsigned whichSetPair_, unsigned scoringMateCandidateIndex_, unsigned seedOffset_, - unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_) + unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_, GenomeLocation readWithFewerHitsUnliftedGenomeLocation_) { readWithFewerHitsGenomeLocation = readWithFewerHitsGenomeLocation_; + readWithFewerHitsUnliftedGenomeLocation = readWithFewerHitsUnliftedGenomeLocation_; whichSetPair = whichSetPair_; _ASSERT(whichSetPair < NUM_SET_PAIRS); // You wouldn't think this would be necessary, but... scoringMateCandidateIndex = scoringMateCandidateIndex_; @@ -432,7 +441,9 @@ class IntersectingPairedEndAligner : public PairedEndAligner bestPossibleScore = bestPossibleScore_; scoreListNext = scoreListNext_; mergeAnchor = NULL; + unliftedMergeAnchor = NULL; } + bool isAlt() const { return readWithFewerHitsGenomeLocation != readWithFewerHitsUnliftedGenomeLocation; } }; // diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index 19f32fcb..0317f139 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -1027,7 +1027,6 @@ SAMFormat::createSAMLine( { contigName = "*"; positionInContig = 0; - const char *cigar = "*"; templateLength = 0; if (secondaryAlignment) { @@ -1063,23 +1062,8 @@ SAMFormat::createSAMLine( return false; } - if (direction == RC) { - for (unsigned i = 0; i < fullLength; i++) { - data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]]; - quality[fullLength - 1 - i] = read->getUnclippedQuality()[i]; - } - clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()]; - basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength(); - basesClippedAfter = read->getFrontClippedLength(); - } else { - memcpy(data, read->getUnclippedData(), read->getUnclippedLength()); - memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength()); - clippedData = read->getData(); - basesClippedBefore = read->getFrontClippedLength(); - basesClippedAfter = fullLength - clippedLength - basesClippedBefore; - } - int editDistance = -1; + const Genome::Contig* contig = NULL; if (genomeLocation != InvalidGenomeLocation) { if (direction == RC) { flags |= SAM_REVERSE_COMPLEMENT; @@ -1092,12 +1076,36 @@ SAMFormat::createSAMLine( contigIndex = (int)(contig - genome->getContigs()); positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based mapQuality = max(0, min(70, mapQuality)); // FIXME: manifest constant. + + if (contig->isAlternateRC) { + // contig was reverse-complemented when building index + flags ^= SAM_REVERSE_COMPLEMENT; + positionInContig = 1 + max(0L, (contig->length - genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); + direction = direction == RC ? FORWARD : RC; + } } else { flags |= SAM_UNMAPPED; mapQuality = 0; *extraBasesClippedBefore = 0; } + if (direction == RC) { + for (unsigned i = 0; i < fullLength; i++) { + data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]]; + quality[fullLength - 1 - i] = read->getUnclippedQuality()[i]; + } + clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()]; + basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength(); + basesClippedAfter = read->getFrontClippedLength(); + } else { + memcpy(data, read->getUnclippedData(), read->getUnclippedLength()); + memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength()); + clippedData = read->getData(); + basesClippedBefore = read->getFrontClippedLength(); + basesClippedAfter = fullLength - clippedLength - basesClippedBefore; + } + + if (hasMate) { flags |= SAM_MULTI_SEGMENT; flags |= (firstInPair ? SAM_FIRST_SEGMENT : SAM_LAST_SEGMENT); @@ -1112,6 +1120,11 @@ SAMFormat::createSAMLine( if (mateDirection == RC) { flags |= SAM_NEXT_REVERSED; } + if (mateContig->isAlternateRC) { + // mate contig was reverse-complemented when building index + flags ^= SAM_NEXT_REVERSED; + matePositionInContig = 1 + max(0L, (mateContig->length - genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); + } if (genomeLocation == InvalidGenomeLocation) { // @@ -1138,16 +1151,17 @@ SAMFormat::createSAMLine( if (alignedAsPair) { flags |= SAM_ALL_ALIGNED; } - // Also compute the length of the whole paired-end string whose ends we saw. This is slightly - // tricky because (a) we may have clipped some bases before/after each end and (b) we need to - // give a signed result based on whether our read is first or second in the pair. - GenomeLocation myStart = genomeLocation - basesClippedBefore; - GenomeLocation myEnd = genomeLocation + clippedLength + basesClippedAfter; - _int64 mateBasesClippedBefore = mate->getFrontClippedLength(); - _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore; - GenomeLocation mateStart = mateLocation - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore); - GenomeLocation mateEnd = mateLocation + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore); - if (contigName == matecontigName) { // pointer (not value) comparison, but that's OK. + // todo: should this look at lifted locations for alt contigs that map to same non-alt contig? + if (contigIndex == mateContigIndex) { + // Also compute the length of the whole paired-end string whose ends we saw. This is slightly + // tricky because (a) we may have clipped some bases before/after each end and (b) we need to + // give a signed result based on whether our read is first or second in the pair. + GenomeDistance myStart = positionInContig - basesClippedBefore; + GenomeDistance myEnd = positionInContig + clippedLength + basesClippedAfter; + _int64 mateBasesClippedBefore = mate->getFrontClippedLength(); + _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore; + GenomeDistance mateStart = matePositionInContig - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore); + GenomeDistance mateEnd = matePositionInContig + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore); if (myStart < mateStart) { templateLength = mateEnd - myStart; } else { @@ -1228,13 +1242,13 @@ SAMFormat::writeRead( } if (genomeLocation != InvalidGenomeLocation) { - cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, - clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, - read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, - &editDistance, o_addFrontClipping); - if (*o_addFrontClipping != 0) { - return false; - } + cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, + clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, + read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, + &editDistance, o_addFrontClipping); + if (*o_addFrontClipping != 0) { + return false; + } } @@ -1371,6 +1385,16 @@ SAMFormat::computeCigar( const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation); + const char *reference = genome->getSubstring(genomeLocation, dataLength); + if (contig->isAlternateRC) { + // the original reference was reverse-complemented on index build to simplify alignment + // so reverse-complement reference for CIGAR string + // data was already flipped in createSAMLine if needed + char* referenceBuf = (char*)alloca(dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); + reference = referenceBuf; + } + if (genomeLocation + dataLength > contig->beginningLocation + contig->length - genome->getChromosomePadding()) { // // The read hangs off the end of the contig. Soft clip it at the end. This is a tentative amount that assumes no net indels in the @@ -1381,7 +1405,6 @@ SAMFormat::computeCigar( *o_extraBasesClippedAfter = 0; } - const char *reference = genome->getSubstring(genomeLocation, dataLength); if (NULL == reference) { // // Fell off the end of the contig. @@ -1393,6 +1416,7 @@ SAMFormat::computeCigar( return; } + *o_editDistance = lv->computeEditDistanceNormalized( reference, (int)(dataLength - *o_extraBasesClippedAfter + MAX_K), // Add space incase of indels. We know there's enough, because the reference is padded. @@ -1566,6 +1590,14 @@ SAMFormat::validateCigarString( WriteErrorMessage("validateCigarString: read alignment location isn't in a chromosome, genomeLocation %lld\n", GenomeLocationAsInt64(genomeLocation)); soft_exit(1); } + if (contig->isAlternateRC) { + // the original reference was reverse-complemented on index build to simplify alignment + // so reverse-complement reference for CIGAR string + // data was already flipped in createSAMLine if needed + char* referenceBuf = (char*)alloca(dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); + reference = referenceBuf; + } if (genomeLocation >= contig->beginningLocation + contig->length - genome->getChromosomePadding()) { WriteErrorMessage("validateCigarString: alignment location is in genome padding: %lld, contig name %s, base %lld, len %lld, padding size %d\n", diff --git a/SNAPLib/SortedDataWriter.cpp b/SNAPLib/SortedDataWriter.cpp index 85ad336b..2d8970fd 100644 --- a/SNAPLib/SortedDataWriter.cpp +++ b/SNAPLib/SortedDataWriter.cpp @@ -184,6 +184,16 @@ SortedDataFilter::onAdvance( GenomeDistance bytes, GenomeLocation location) { + if (location != InvalidGenomeLocation && parent->genome->hasAltContigs()) { + // reads mapped to RC alt contigs need to have location flipped so they sort properly + const Genome::Contig* c = parent->genome->getContigAtLocation(location); + if (c != NULL && c->isAlternateRC) { + GenomeLocation rcLocation; + GenomeDistance ignore; + parent->format->getSortInfo(parent->genome, data, bytes, &rcLocation, &ignore); + location = rcLocation; + } + } SortEntry entry(batchOffset, bytes, location); #ifdef VALIDATE_SORT if (memcmp(data, "BAM", 3) != 0 && memcmp(data, "@HD", 3) != 0) { // skip header block diff --git a/SNAPLib/VariableSizeVector.h b/SNAPLib/VariableSizeVector.h index e32d3dc2..f1b21bc7 100644 --- a/SNAPLib/VariableSizeVector.h +++ b/SNAPLib/VariableSizeVector.h @@ -4,6 +4,7 @@ // // A variable-size vector that does not perform any memory allocation except to grow. +// if grow==0 then it must be supplied with a vector to start and it will NOT grow or deallocate // template<typename V, int grow = 150, bool big = false> class VariableSizeVector @@ -17,31 +18,43 @@ class VariableSizeVector WriteErrorMessage("%s: allocate %lld - consider using BigAlloc\n", __FUNCTION__, bytes); } #endif + _ASSERT(grow != 0); return big ? BigAlloc(bytes) : malloc(bytes); } inline static void deallocate(void* p) { - if (big) { BigDealloc(p); } else { free(p); } + if (grow != 0) { + if (big) { BigDealloc(p); } else { free(p); } + } } public: + VariableSizeVector(int i_capacity = 16) : entries(NULL), count(0), capacity(i_capacity) - {} + { _ASSERT(grow != 0); } - VariableSizeVector(VariableSizeVector& other) + VariableSizeVector(V* i_entries, int i_capacity) + : entries(i_entries), capacity(i_capacity), count(0) + { _ASSERT(grow == 0); } + + VariableSizeVector(VariableSizeVector<V,grow,big>& other) : entries(other.entries), count(other.count), capacity(other.capacity) { other.count = 0; + other.capacity = 0; other.entries = NULL; } ~VariableSizeVector() { if (entries != NULL) { - deallocate(entries); + if (grow != 0) { + deallocate(entries); + } entries = NULL; + capacity = 0; count = 0; } } @@ -57,7 +70,7 @@ class VariableSizeVector } public: - void operator=(VariableSizeVector<V>& other) + void operator=(VariableSizeVector<V,grow,big>& other) { entries = other.entries; capacity = other.capacity; @@ -68,6 +81,13 @@ class VariableSizeVector void reserve(_int64 newCapacity) { + if (grow == 0) { + if (newCapacity <= capacity) { + return; + } + WriteErrorMessage("Unable to grow fixed VariableSizeVector from %ld to %ld\n", capacity, newCapacity); + soft_exit(1); + } _ASSERT(newCapacity >= 0); if (newCapacity <= capacity && entries != NULL) { return; @@ -89,8 +109,10 @@ class VariableSizeVector inline void clean() { if (entries != NULL) { - deallocate(entries); - entries = NULL; + if (grow != 0) { + deallocate(entries); + entries = NULL; + } count = 0; } } @@ -109,11 +131,7 @@ class VariableSizeVector inline void push_back(V& value) { - if (entries == NULL) { - reserve(capacity); - } else if (count == capacity) { - reserve((int) (((_int64) count * grow) / 100)); - } + increase(); _ASSERT(count < capacity); entries[count++] = value; } diff --git a/tests/alttestgen.py b/tests/alttestgen.py new file mode 100644 index 00000000..f92bc700 --- /dev/null +++ b/tests/alttestgen.py @@ -0,0 +1,146 @@ +# alttestgen.py +# +# generate data files for alt-contig test +# +# test is alttest.py +# + +import sys +import os +import shutil +import subprocess +import random + +BASES = "ACTG" +RCBASES = {"A":"T", "T":"A", "C":"G", "G":"C"} + +def random_bases(n): + result = "" + for i in range(n): + result = result + random.choice(BASES) + return result + +def random_mutate(seq, p = 0.02): + for i in range(len(seq)): + if random.random() <= p: + b = "ACTG".find(seq[i:i+1]) + seq = seq[:i] + random.choice(BASES[:b] + BASES[b+1:]) + seq[i + 1:] + return seq + +def rc(seq): + result = "" + for c in seq: + result = RCBASES[c] + result + return result + +class Read: + def __init__(self, id, chr, pos, seq, qual=None): + self.id = id + self.chr = chr + self.pos = pos + self.seq = seq + self.qual = qual + + def __str__(self): + return "Read({}, {}, {}, {})".format(self.id, self.chr, self.pos, self.seq) + +class Contig: + def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False): + self.name = name + self.accession = accession + self.seq = seq + self.isAlt = isAlt + self.parent = parent + self.parentLoc = parentLoc + self.isAltRC = isAltRC + + def __str__(self): + return "Contig({}, {}, {}, {}, {}, {}, {})".format( + self.name, self.accession, self.seq, 'alt' if self.isAlt else 'ref', + self.parent, self.parentLoc, 'rc' if self.isAltRC else '') + +class Genome: + def __init__(self, contigs={}): + self.contigs = contigs + + def add(self, contig): + self.contigs[contig.name] = contig + + def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.02): + pc = self.contigs[parent] + altseq = random_mutate(pc.seq[start:stop], pmut) + if (isRC): + altseq = rc(altseq) + self.add(Contig(name, accession, altseq, True, parent, start, isRC)) + + def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None): + if id == None: + id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f')) + seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut) + return Read(id, chr, pos, seq) + + def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): + id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1) + r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") + r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") + return [r1, r2] + + def to_sam_pair(self, read1, read2): + rc1 = 1 if self.contigs[read1.chr].isAltRC else 0 + rc2 = 0 if self.contigs[read2.chr].isAltRC else 1 + r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + read1.id, 67+16*rc1+32*rc2, read1.chr, read1.pos + 1, 60, len(read1.seq), read2.chr, + read2.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read1.seq, (['ABCD','DCBA'][rc1]*int(len(read1.seq)/4+1))[:len(read1.seq)]) + return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + read2.id, 131+16*rc2+32*rc1, read2.chr, read2.pos + 1, 60, len(read2.seq), read1.chr, + read1.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read2.seq, (['ABCD','DCBA'][rc2]*int(len(read2.seq)/4+1))[:len(read2.seq)]) + + def write_fasta(self, filename): + with open(filename, 'w') as file: + for write_alts in [False, True]: + cnames = self.contigs.keys() + cnames.sort() + for cname in cnames: + contig = self.contigs[cname] + if contig.isAlt == write_alts: + file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) + LINE_LEN=100 + for i in range(0, len(contig.seq), LINE_LEN): + file.write("{}\n".format(contig.seq[i:i+LINE_LEN])) + + def write_alts(self, filename): + with open(filename, 'w') as file: + file.write("#alt_scaf_acc\tparent_acc\tori\talt_scaf_start\talt_scaf_stop\tparent_start\tparent_stop\talt_start_tail\talt_stop_tail\n") + for contig in self.contigs.values(): + if contig.isAlt: + file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + contig.accession, self.contigs[contig.parent].accession, '-' if contig.isAltRC else '+', + 1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0)) + +g = Genome() +seq = random_bases(7000) +seq = seq + random_mutate(seq[5000:6000]) + random_bases(1000) + random_mutate(seq[5000:6000]) + random_bases(1000) +g.add(Contig("chr1", "C01", seq)) +g.add_alt("chr1a", "C01A", "chr1", 1000, 2000) +g.add_alt("chr1b", "C01B", "chr1", 3000, 4000, True) +g.add_alt("chr1c", "C01C", "chr1", 5000, 6000) +g.add_alt("chr1d", "C01D", "chr1", 7000, 8000, True) +g.add_alt("chr1e", "C01E", "chr1", 9000, 10000) +g.add_alt("chr1f", "C01F", "chr1", 9000, 10000) +g.write_fasta("test.fa") +g.write_alts("test_alts.txt") + +with open("test.sam", "w") as file: + for i in [0, 100, 600, 800, 900]: + for j in range(5): + start = j * 2000 + chralt = ['chr1a', 'chr1b', 'chr1c', 'chr1d', 'chr1e'][j] + ialt = 900 - i if g.contigs[chralt].isAltRC else i + [r1, r2] = g.make_pair('chr1', i + start, chralt, ialt) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair('chr1', i + start, 'chr1', i + start + 1000) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair(chralt, ialt, 'chr1', i + start + 2000) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair('chr1', i + start + 1000, 'chr1', i + start + 2000) + file.write(g.to_sam_pair(r1,r2))