BamTools 2.5.2
Loading...
Searching...
No Matches
BamAlignment.h
Go to the documentation of this file.
1// ***************************************************************************
2// BamAlignment.h (c) 2009 Derek Barnett
3// Marth Lab, Department of Biology, Boston College
4// ---------------------------------------------------------------------------
5// Last modified: 25 July 2013 (DB)
6// ---------------------------------------------------------------------------
7// Provides the BamAlignment data structure
8// ***************************************************************************
9
10#ifndef BAMALIGNMENT_H
11#define BAMALIGNMENT_H
12
13#include <cstddef>
14#include <cstdlib>
15#include <cstring>
16#include <string>
17#include <vector>
18#include "api/BamAux.h"
19#include "api/BamConstants.h"
20#include "api/api_global.h"
21
22namespace BamTools {
23
25// forward declaration of BamAlignment's "friends"
// These two classes are only named here; BamAlignment lists them as friends
// further down so they can reach its private SupportData member.
26namespace Internal {
27class BamReaderPrivate;
28class BamWriterPrivate;
29} // namespace Internal
31
32// BamAlignment data structure
// The main BAM alignment data structure.
//
// Holds one alignment record as public data fields (name, bases, qualities,
// raw tag data, positions, bit-flag, CIGAR operations) together with:
//   - query/set helpers for the individual bits of AlignmentFlag,
//   - template methods that add, edit and read entries stored in TagData
//     (their definitions follow the class in this header),
//   - a small private bookkeeping struct (SupportData).
33class API_EXPORT BamAlignment
34{
35
36 // constructors & destructor
 // NOTE(review): no constructor or destructor declaration is visible in this
 // listing (the listing numbers jump from 37 to 39) - confirm against the
 // original header which special members are declared here.
37public:
39
40 // queries against alignment flags
41public:
42 bool IsDuplicate() const; // returns true if this read is a PCR duplicate
43 bool IsFailedQC() const; // returns true if this read failed quality control
44 bool IsFirstMate() const; // returns true if alignment is first mate on read
45 bool IsMapped() const; // returns true if alignment is mapped
46 bool IsMateMapped() const; // returns true if alignment's mate is mapped
47 bool IsMateReverseStrand() const; // returns true if alignment's mate mapped to reverse strand
48 bool IsPaired() const; // returns true if alignment part of paired-end read
49 bool IsSupplementary() const; // returns true if this read is supplementary
50 bool IsPrimaryAlignment() const; // returns true if reported position is primary alignment
51 bool IsProperPair()
52 const; // returns true if alignment is part of read that satisfied paired-end resolution
53 bool IsReverseStrand() const; // returns true if alignment mapped to reverse strand
54 bool IsSecondMate() const; // returns true if alignment is second mate on read
55
56 // manipulate alignment flags
 // NOTE(review): IsSupplementary() above has no matching setter in this
 // listing - confirm whether that is intentional.
57public:
58 void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag
59 void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag
60 void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag
61 void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag
62 void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag
63 void SetIsMateReverseStrand(
64 bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
65 void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag
66 void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag
67 void SetIsProperPair(
68 bool
69 ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag
70 void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag
71 void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag
72
73 // tag data access methods
 // All of the templates below are defined after the class in this header.
 // Each returns a bool; the definitions return false on failure and true
 // on success.
74public:
75 // add a new tag
 // (the definitions return false when the tag already exists - use EditTag
 // to replace a value)
76 template <typename T>
77 bool AddTag(const std::string& tag, const std::string& type, const T& value);
78 template <typename T>
79 bool AddTag(const std::string& tag, const std::vector<T>& values);
80
81 // edit (or append) tag
 // (the definitions remove any existing tag of that name, then call AddTag)
82 template <typename T>
83 bool EditTag(const std::string& tag, const std::string& type, const T& value);
84 template <typename T>
85 bool EditTag(const std::string& tag, const std::vector<T>& values);
86
87 // retrieves tag data
 // (the result is written to 'destination'; the definitions return false
 // without reading anything when the alignment is core-only or has no tags)
88 template <typename T>
89 bool GetTag(const std::string& tag, T& destination) const;
90 template <typename T>
91 bool GetTag(const std::string& tag, std::vector<T>& destination) const;
92
93 // retrieves all current tag names
94 std::vector<std::string> GetTagNames() const;
95
96 // retrieves the SAM/BAM type-code for requested tag name
97 bool GetTagType(const std::string& tag, char& type) const;
98
99 // retrieves the SAM/BAM type-code for the data elements in an array tag
100 bool GetArrayTagType(const std::string& tag, char& type) const;
101
102 // returns true if alignment has a record for this tag name
103 bool HasTag(const std::string& tag) const;
104
105 // removes a tag
106 void RemoveTag(const std::string& tag);
107
108 // additional methods
109public:
110 // populates alignment string fields
 // (the tag templates call this first when SupportData.HasCoreOnly is set)
111 bool BuildCharData();
112
113 // calculates alignment end position
 // NOTE(review): the exact effect of 'usePadded' and 'closedInterval' is
 // defined in the implementation file, not here - confirm before relying on it.
114 int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
115
116 // returns a description of the last error that occurred
117 std::string GetErrorString() const;
118
119 // retrieves the size, read locations and reference locations of soft-clip operations
 // (results are returned through the three vector out-parameters)
120 bool GetSoftClips(std::vector<int>& clipSizes, std::vector<int>& readPositions,
121 std::vector<int>& genomePositions, bool usePadded = false) const;
122
123 // public data fields
124public:
125 std::string Name; // read name
126 int32_t Length; // length of query sequence
127 std::string QueryBases; // 'original' sequence (contained in BAM file)
128 std::string
129 AlignedBases; // 'aligned' sequence (QueryBases plus deletion, padding, clipping chars)
130 std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
131 std::string TagData; // tag data (use provided methods to query/modify)
132 int32_t RefID; // ID number for reference sequence
133 int32_t Position; // position (0-based) where alignment starts
134 uint16_t Bin; // BAM (standard) index bin number for this alignment
135 uint16_t MapQuality; // mapping quality score
136 uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify)
137 std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
138 int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
139 int32_t MatePosition; // position (0-based) where alignment's mate starts
140 int32_t InsertSize; // mate-pair insert size
141 std::string Filename; // name of BAM file which this alignment comes from
142
144 // internal utility methods
 // FindTag and SkipToNextTag take the tag-data cursor by reference; the tag
 // templates read the found tag's type code and payload through that cursor
 // after a successful FindTag call.
145private:
146 bool FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength,
147 unsigned int& numBytesParsed) const;
148 bool IsValidSize(const std::string& tag, const std::string& type) const;
149 void SetErrorString(const std::string& where, const std::string& what) const;
150 bool SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) const;
151
153 // internal data
154private:
 // Bookkeeping kept alongside the public fields. HasCoreOnly is the flag the
 // tag templates test: the Add/Edit methods call BuildCharData() when it is
 // set, and the GetTag methods return false when it is set.
 // NOTE(review): the meaning of the remaining members is presumably tied to
 // the raw BAM record layout - confirm in the reader/writer implementation.
155 struct BamAlignmentSupportData
156 {
157
159 // data members
160 std::string AllCharData;
161 uint32_t BlockLength;
162 uint32_t NumCigarOperations;
163 uint32_t QueryNameLength;
164 uint32_t QuerySequenceLength;
165 bool HasCoreOnly;
166
168 // constructor
 // zeroes every counter and clears HasCoreOnly; AllCharData starts empty
169 BamAlignmentSupportData()
170 : BlockLength(0)
171 , NumCigarOperations(0)
172 , QueryNameLength(0)
173 , QuerySequenceLength(0)
174 , HasCoreOnly(false)
175 {}
176 };
177 BamAlignmentSupportData SupportData;
 // the reader/writer implementation classes get access to the private members
178 friend class Internal::BamReaderPrivate;
179 friend class Internal::BamWriterPrivate;
180
181 mutable std::string ErrorString; // mutable to allow updates even in logically const methods
182};
183
184// ---------------------------------------------------------
185// BamAlignment tag access methods
186
198template <typename T>
199bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value)
200{
201
202 // if char data not populated, do that first
203 if (SupportData.HasCoreOnly) {
205 }
206
207 // check tag/type size
208 if (!IsValidSize(tag, type)) {
209 // TODO: set error string?
210 return false;
211 }
212
213 // check that storage type code is OK for T
214 if (!TagTypeHelper<T>::CanConvertTo(type.at(0))) {
215 // TODO: set error string?
216 return false;
217 }
218
219 // localize the tag data
220 char* pTagData = (char*)TagData.data();
221 const unsigned int tagDataLength = TagData.size();
222 unsigned int numBytesParsed = 0;
223
224 // if tag already exists, return false
225 // use EditTag explicitly instead
226 if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
227 // TODO: set error string?
228 return false;
229 }
230
231 // otherwise, convert value to string
232 union
233 {
234 T value;
235 char valueBuffer[sizeof(T)];
236 } un;
237 un.value = value;
238
239 // copy original tag data to temp buffer
240 const std::string newTag = tag + type;
241 const std::size_t newTagDataLength =
242 tagDataLength + newTag.size() + sizeof(T); // leave room for new T
243 RaiiBuffer originalTagData(newTagDataLength);
244 std::memcpy(originalTagData.Buffer, TagData.c_str(),
245 tagDataLength + 1); // '+1' for TagData null-term
246
247 // append newTag
248 std::strcat(originalTagData.Buffer + tagDataLength, newTag.data());
249 std::memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
250
251 // store temp buffer back in TagData
252 const char* newTagData = (const char*)originalTagData.Buffer;
253 TagData.assign(newTagData, newTagDataLength);
254 return true;
255}
256
257template <>
258inline bool BamAlignment::AddTag<std::string>(const std::string& tag, const std::string& type,
259 const std::string& value)
260{
261 // if char data not populated, do that first
262 if (SupportData.HasCoreOnly) {
263 BuildCharData();
264 }
265
266 // check tag/type size
267 if (!IsValidSize(tag, type)) {
268 // TODO: set error string?
269 return false;
270 }
271
272 // check that storage type code is OK for string
273 if (!TagTypeHelper<std::string>::CanConvertTo(type.at(0))) {
274 // TODO: set error string?
275 return false;
276 }
277
278 // localize the tag data
279 char* pTagData = (char*)TagData.data();
280 const unsigned int tagDataLength = TagData.size();
281 unsigned int numBytesParsed = 0;
282
283 // if tag already exists, return false
284 // use EditTag explicitly instead
285 if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
286 // TODO: set error string?
287 return false;
288 }
289
290 // otherwise, copy tag data to temp buffer
291 const std::string newTag = tag + type + value;
292 const std::size_t newTagDataLength =
293 tagDataLength + newTag.size() + 1; // leave room for null-term
294 RaiiBuffer originalTagData(newTagDataLength);
295 std::memcpy(originalTagData.Buffer, TagData.c_str(),
296 tagDataLength + 1); // '+1' for TagData null-term
297
298 // append newTag (removes original null-term, then appends newTag + null-term)
299 std::strcat(originalTagData.Buffer + tagDataLength, newTag.data());
300
301 // store temp buffer back in TagData
302 const char* newTagData = (const char*)originalTagData.Buffer;
303 TagData.assign(newTagData, newTagDataLength);
304 return true;
305}
306
317template <typename T>
318bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values)
319{
320
321 // if char data not populated, do that first
322 if (SupportData.HasCoreOnly) {
324 }
325
326 // check for valid tag name length
327 if (tag.size() != Constants::BAM_TAG_TAGSIZE) {
328 return false;
329 }
330
331 // localize the tag data
332 char* pTagData = (char*)TagData.data();
333 const unsigned int tagDataLength = TagData.size();
334 unsigned int numBytesParsed = 0;
335
336 // if tag already exists, return false
337 // use EditTag explicitly instead
338 if (FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
339 // TODO: set error string?
340 return false;
341 }
342
343 // build new tag's base information
344 char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
345 std::memcpy(newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE);
346 newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
347 newTagBase[3] = TagTypeHelper<T>::TypeCode();
348
349 // add number of array elements to newTagBase
350 const int32_t numElements = values.size();
351 std::memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
352
353 // copy current TagData string to temp buffer, leaving room for new tag's contents
354 const std::size_t newTagDataLength =
355 tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + numElements * sizeof(T);
356 RaiiBuffer originalTagData(newTagDataLength);
357 std::memcpy(originalTagData.Buffer, TagData.c_str(),
358 tagDataLength + 1); // '+1' for TagData's null-term
359
360 // write newTagBase (removes old null term)
361 std::strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
362
363 // add vector elements to tag
364 int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
365 for (int i = 0; i < numElements; ++i) {
366 const T& value = values.at(i);
367 std::memcpy(originalTagData.Buffer + elementsBeginOffset + i * sizeof(T), &value,
368 sizeof(T));
369 }
370
371 // store temp buffer back in TagData
372 const char* newTagData = (const char*)originalTagData.Buffer;
373 TagData.assign(newTagData, newTagDataLength);
374 return true;
375}
376
391template <typename T>
392bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value)
393{
394
395 // if char data not populated, do that first
396 if (SupportData.HasCoreOnly) {
398 }
399
400 // remove existing tag if present, then append tag with new value
401 if (HasTag(tag)) {
402 RemoveTag(tag);
403 }
404 return AddTag(tag, type, value);
405}
406
418template <typename T>
419bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values)
420{
421
422 // if char data not populated, do that first
423 if (SupportData.HasCoreOnly) {
425 }
426
427 // remove existing tag if present, then append tag with new values
428 if (HasTag(tag)) {
429 RemoveTag(tag);
430 }
431 return AddTag(tag, values);
432}
433
441template <typename T>
442bool BamAlignment::GetTag(const std::string& tag, T& destination) const
443{
444
445 // skip if alignment is core-only
446 if (SupportData.HasCoreOnly) {
447 // TODO: set error string?
448 return false;
449 }
450
451 // skip if no tags present
452 if (TagData.empty()) {
453 // TODO: set error string?
454 return false;
455 }
456
457 // localize the tag data
458 char* pTagData = (char*)TagData.data();
459 const unsigned int tagDataLength = TagData.size();
460 unsigned int numBytesParsed = 0;
461
462 // return failure if tag not found
463 if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
464 // TODO: set error string?
465 return false;
466 }
467
468 // fetch data type
469 const char type = *(pTagData - 1);
470 if (!TagTypeHelper<T>::CanConvertFrom(type)) {
471 // TODO: set error string ?
472 return false;
473 }
474
475 // determine data length
476 int destinationLength = 0;
477 switch (type) {
478
479 // 1 byte data
483 destinationLength = 1;
484 break;
485
486 // 2 byte data
489 destinationLength = 2;
490 break;
491
492 // 4 byte data
496 destinationLength = 4;
497 break;
498
499 // var-length types not supported for numeric destination
503 SetErrorString("BamAlignment::GetTag",
504 "cannot store variable length tag data into a numeric destination");
505 return false;
506
507 // unrecognized tag type
508 default:
509 const std::string message = std::string("invalid tag type: ") + type;
510 SetErrorString("BamAlignment::GetTag", message);
511 return false;
512 }
513
514 // store data in destination
515 destination = 0;
516 std::memcpy(&destination, pTagData, destinationLength);
517
518 // return success
519 return true;
520}
521
522template <>
523inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
524 std::string& destination) const
525{
526 // skip if alignment is core-only
527 if (SupportData.HasCoreOnly) {
528 // TODO: set error string?
529 return false;
530 }
531
532 // skip if no tags present
533 if (TagData.empty()) {
534 // TODO: set error string?
535 return false;
536 }
537
538 // localize the tag data
539 char* pTagData = (char*)TagData.data();
540 const unsigned int tagDataLength = TagData.size();
541 unsigned int numBytesParsed = 0;
542
543 // return failure if tag not found
544 if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
545 // TODO: set error string?
546 return false;
547 }
548
549 // otherwise copy data into destination
550 const unsigned int dataLength = std::strlen(pTagData);
551 destination.clear();
552 destination.resize(dataLength);
553 std::memcpy((char*)destination.data(), pTagData, dataLength);
554
555 // return success
556 return true;
557}
558
566template <typename T>
567bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const
568{
569
570 // skip if alignment is core-only
571 if (SupportData.HasCoreOnly) {
572 // TODO: set error string?
573 return false;
574 }
575
576 // skip if no tags present
577 if (TagData.empty()) {
578 // TODO: set error string?
579 return false;
580 }
581
582 // localize the tag data
583 char* pTagData = (char*)TagData.data();
584 const unsigned int tagDataLength = TagData.size();
585 unsigned int numBytesParsed = 0;
586
587 // return false if tag not found
588 if (!FindTag(tag, pTagData, tagDataLength, numBytesParsed)) {
589 // TODO: set error string?
590 return false;
591 }
592
593 // check that tag is array type
594 const char tagType = *(pTagData - 1);
595 if (tagType != Constants::BAM_TAG_TYPE_ARRAY) {
596 SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
597 return false;
598 }
599
600 // fetch element type
601 const char elementType = *pTagData;
602 if (!TagTypeHelper<T>::CanConvertFrom(elementType)) {
603 // TODO: set error string ?
604 return false;
605 }
606 ++pTagData;
607
608 // calculate length of each element in tag's array
609 switch (elementType) {
613 break;
614
617 break;
618
622 break;
623
624 // var-length types not supported for numeric destination
628 SetErrorString("BamAlignment::GetTag",
629 "invalid array data, variable-length elements are not allowed");
630 return false;
631
632 // unknown tag type
633 default:
634 const std::string message = std::string("invalid array element type: ") + elementType;
635 SetErrorString("BamAlignment::GetTag", message);
636 return false;
637 }
638
639 // get number of elements
640 int32_t numElements;
641 std::memcpy(&numElements, pTagData, sizeof(int32_t));
642 pTagData += 4;
643 destination.clear();
644 destination.reserve(numElements);
645
646 // read in elements
647 T value;
648 for (int i = 0; i < numElements; ++i) {
649 std::memcpy(&value, pTagData, sizeof(T));
650 pTagData += sizeof(T);
651 destination.push_back(value);
652 }
653
654 // return success
655 return true;
656}
657
// Convenience name for a std::vector of BamAlignment objects.
658typedef std::vector<BamAlignment> BamAlignmentVector;
659
660} // namespace BamTools
661
662#endif // BAMALIGNMENT_H
The main BAM alignment data structure.
Definition BamAlignment.h:34
std::string AlignedBases
'aligned' sequence (includes any indels, padding, clipping)
Definition BamAlignment.h:129
bool BuildCharData()
Populates alignment string fields (read name, bases, qualities, tag data).
Definition BamAlignment.cpp:108
std::string TagData
tag data (use the provided methods to query/modify)
Definition BamAlignment.h:131
std::string Qualities
FASTQ qualities (ASCII characters, not numeric values)
Definition BamAlignment.h:130
bool GetTag(const std::string &tag, T &destination) const
Definition BamAlignment.h:442
int32_t InsertSize
mate-pair insert size
Definition BamAlignment.h:140
std::vector< CigarOp > CigarData
CIGAR operations for this alignment.
Definition BamAlignment.h:137
int32_t Position
position (0-based) where alignment starts
Definition BamAlignment.h:133
bool AddTag(const std::string &tag, const std::string &type, const T &value)
Definition BamAlignment.h:199
uint32_t AlignmentFlag
alignment bit-flag (use the provided methods to query/modify)
Definition BamAlignment.h:136
int32_t RefID
ID number for reference sequence.
Definition BamAlignment.h:132
uint16_t MapQuality
mapping quality score
Definition BamAlignment.h:135
std::string Filename
name of BAM file which this alignment comes from
Definition BamAlignment.h:141
int32_t MateRefID
ID number for reference sequence where alignment's mate was aligned.
Definition BamAlignment.h:138
uint16_t Bin
BAM (standard) index bin number for this alignment.
Definition BamAlignment.h:134
int32_t Length
length of query sequence
Definition BamAlignment.h:126
int32_t MatePosition
position (0-based) where alignment's mate starts
Definition BamAlignment.h:139
bool HasTag(const std::string &tag) const
Returns true if alignment has a record for requested tag.
Definition BamAlignment.cpp:723
std::string QueryBases
'original' sequence (as reported from sequencing machine)
Definition BamAlignment.h:127
std::string Name
read name
Definition BamAlignment.h:125
void RemoveTag(const std::string &tag)
Removes field from BAM tags.
Definition BamAlignment.cpp:856
bool EditTag(const std::string &tag, const std::string &type, const T &value)
Definition BamAlignment.h:392
const char BAM_TAG_TYPE_UINT8
Definition BamConstants.h:76
const char BAM_TAG_TYPE_HEX
Definition BamConstants.h:83
const char BAM_TAG_TYPE_INT32
Definition BamConstants.h:79
const char BAM_TAG_TYPE_ASCII
Definition BamConstants.h:74
const uint8_t BAM_TAG_TAGSIZE
Definition BamConstants.h:86
const char BAM_TAG_TYPE_ARRAY
Definition BamConstants.h:84
const char BAM_TAG_TYPE_FLOAT
Definition BamConstants.h:81
const char BAM_TAG_TYPE_UINT32
Definition BamConstants.h:80
const char BAM_TAG_TYPE_STRING
Definition BamConstants.h:82
const char BAM_TAG_TYPE_INT8
Definition BamConstants.h:75
const char BAM_TAG_TYPE_UINT16
Definition BamConstants.h:78
const char BAM_TAG_TYPE_INT16
Definition BamConstants.h:77
const uint8_t BAM_TAG_ARRAYBASE_SIZE
Definition BamConstants.h:88
Contains all BamTools classes & methods.
Definition Sort.h:24
std::vector< BamAlignment > BamAlignmentVector
Definition BamAlignment.h:658