libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
enzyme.cpp
Go to the documentation of this file.
1/*******************************************************************************
2 * Copyright (c) 2015 Olivier Langella <Olivier.Langella@moulon.inra.fr>.
3 *
4 * This file is part of the PAPPSOms++ library.
5 *
6 * PAPPSOms++ is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * PAPPSOms++ is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with PAPPSOms++. If not, see <http://www.gnu.org/licenses/>.
18 *
19 * Contributors:
20 * Olivier Langella <Olivier.Langella@moulon.inra.fr> - initial API and
21 *implementation
22 ******************************************************************************/
23
24#include "enzyme.h"
25#include <QStringList>
26#include <QDebug>
28// #include <iostream>
29
30namespace pappso
31{
33{
34 m_recognitionSite.setPattern("([KR])([^P])");
35 m_miscleavage = 0;
36
37
38 char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40 m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41
42 char vv2[] = {'N', 'D'};
43 m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44
45 char vv3[] = {'Q', 'E'};
46 m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47}
48
49Enzyme::Enzyme(const QString &recognition_site)
50{
51 m_recognitionSite.setPattern(recognition_site);
52 m_miscleavage = 0;
53
54
55 char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57 m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58
59 char vv2[] = {'N', 'D'};
60 m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61
62 char vv3[] = {'Q', 'E'};
63 m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64}
65
69
70void
71Enzyme::setMiscleavage(unsigned int miscleavage)
72{
73 m_miscleavage = miscleavage;
74}
75unsigned int
77{
78 return m_miscleavage;
79}
80void
81Enzyme::setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
82{
83 m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84}
85
86void
87Enzyme::eat(std::int8_t sequence_database_id,
88 const ProteinSp &protein_sp,
89 bool is_decoy,
90 EnzymeProductInterface &enzyme_product) const
91{
92 /*
93 * for aa in self.aa_to_cut:
94 seq = seq.replace(aa, aa + ' ')
95 seq_stack = []
96 for s in seq.strip().split(' '):
97 seq_stack.append(s)
98 if len(seq_stack) > self.misscleavage + 1:
99 seq_stack.pop(0)
100 s2 = ""
101 for s_miss in seq_stack[::-1]:
102 s2 = s_miss + s2
103 yield s2
104 */
105 qDebug() << "Enzyme::eat begin ";
106 const QString sequence = protein_sp.get()->getSequence();
107 qDebug() << sequence;
108 QStringList peptide_list;
109 int pos = 0;
110 int peptide_start = 0;
111 int peptide_size = sequence.size();
112 QRegularExpressionMatch match_recognition_site =
113 m_recognitionSite.match(sequence, pos);
114 while(match_recognition_site.hasMatch())
115 {
116 pos = match_recognition_site.capturedStart(0);
117 peptide_size =
118 pos + match_recognition_site.captured(1).length() - peptide_start;
119 // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
120 // peptide_size=" << peptide_size << " " <<
121 // sequence.mid(peptide_start,peptide_size);
122 if(peptide_size > 0)
123 {
124 peptide_list.append(sequence.mid(peptide_start, peptide_size));
125 }
126 peptide_start += peptide_size;
127 pos = peptide_start; // all peptides MUST be consecutive
128 match_recognition_site = m_recognitionSite.match(sequence, pos);
129 }
130 peptide_size = sequence.size() - peptide_start;
131 if(peptide_size > 0)
132 {
133 peptide_list.append(sequence.mid(peptide_start, peptide_size));
134 }
135
136 unsigned int start = 1;
137 bool is_nter = true;
138 foreach(const QString &peptide, peptide_list)
139 {
140 // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
141 // peptide, start,is_nter,0, false);
142 sanityCheck(enzyme_product,
143 sequence_database_id,
144 protein_sp,
145 is_decoy,
146 peptide,
147 start,
148 is_nter,
149 0,
150 false);
151 is_nter = false;
152 start += peptide.size();
153 }
154
155 unsigned int miscleavage_i = 0;
156 while(miscleavage_i < m_miscleavage)
157 {
158 miscleavage_i++;
159 qDebug() << "miscleavage_i=" << miscleavage_i;
160 int chunk_number = miscleavage_i + 1;
161 unsigned int start = 1;
162 bool is_nter = true;
163
164 for(auto i = 0; i < peptide_list.size(); ++i)
165 {
166 qDebug() << "start=" << start;
167 QStringList peptide_mis_list;
168 for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size());
169 j++)
170 {
171 peptide_mis_list << peptide_list.at(i + j);
172 }
173 if(peptide_mis_list.size() == chunk_number)
174 {
175 // enzyme_product.setPeptide(sequence_database_id,
176 // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
177 // miscleavage_i, false);
178 sanityCheck(enzyme_product,
179 sequence_database_id,
180 protein_sp,
181 is_decoy,
182 peptide_mis_list.join(""),
183 start,
184 is_nter,
185 miscleavage_i,
186 false);
187 }
188 is_nter = false;
189 start += peptide_list.at(i).size();
190 }
191 }
192}
193
194void
195Enzyme::replaceWildcards(std::vector<std::string> *p_peptide_variant_list) const
196{
197 std::string new_peptide = p_peptide_variant_list->at(0);
198 qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
199 std::vector<std::string> old_peptide_variant_list;
200 old_peptide_variant_list.assign(p_peptide_variant_list->begin(),
201 p_peptide_variant_list->end());
202
203
204 for(char wildcard : {'X', 'B', 'Z'})
205 {
206
207 std::size_t position = new_peptide.find(wildcard);
208 if(position == std::string::npos)
209 {
210 continue;
211 }
212 else
213 {
214 p_peptide_variant_list->clear();
215 /*
216 new_peptide[position] = 'A';
217 p_peptide_variant_list->push_back(new_peptide);
218 break;
219 */
220
221 const std::vector<char> *p_x_replace_wildcard = nullptr;
222 if(wildcard == 'X')
223 {
224 p_x_replace_wildcard = &m_wildCardX;
225 }
226 else if(wildcard == 'B')
227 {
228 p_x_replace_wildcard = &m_wildCardB;
229 }
230 else if(wildcard == 'Z')
231 {
232 p_x_replace_wildcard = &m_wildCardZ;
233 }
234
235 if(p_x_replace_wildcard != nullptr)
236 {
237 for(std::string orig_peptide : old_peptide_variant_list)
238 {
239 for(char replace : *p_x_replace_wildcard)
240 {
241 orig_peptide[position] = replace;
242 p_peptide_variant_list->push_back(orig_peptide);
243 }
244 }
245 }
246 else
247 {
249 QObject::tr("x_replace_wildcard is empty"));
250 }
251 // new_peptide[position] = 'A';
252 // p_peptide_variant_list->push_back(new_peptide);
253 // p_peptide_variant_list->resize(1);
254 // std::cerr << "Enzyme::replaceWildcards begin
255 // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
256 // <<
257 // endl;
258 break;
259 }
260 }
261 std::vector<std::string>().swap(
262 old_peptide_variant_list); // clear old_peptide_variant_list reallocating
263
264
265 qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
266}
267
268void
269Enzyme::setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
270{
271 m_takeOnlyFirstWildcard = take_only_first_wildcard;
272}
273
274
275void
277 std::int8_t sequence_database_id,
278 const ProteinSp &protein_sp,
279 bool is_decoy,
280 const PeptideStr &peptide,
281 unsigned int start,
282 bool is_nter,
283 unsigned int missed_cleavage_number,
284 bool semi_enzyme) const
285{
286 if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
287 {
288
289 std::vector<std::string> peptide_variant_list;
290 peptide_variant_list.push_back(peptide.toStdString());
291
292 while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
293 (peptide_variant_list.at(0).find('B') != std::string::npos) ||
294 (peptide_variant_list.at(0).find('Z') != std::string::npos))
295 {
296 replaceWildcards(&peptide_variant_list);
297 if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
298 {
299 peptide_variant_list.resize(m_maxPeptideVariantListSize);
300 peptide_variant_list.shrink_to_fit();
301 }
302 }
303
304 // peptide_variant_list.resize(2);
306 {
307 enzyme_product.setPeptide(sequence_database_id,
308 protein_sp,
309 is_decoy,
310 QString(peptide_variant_list.at(0).c_str()),
311 start,
312 is_nter,
313 missed_cleavage_number,
314 semi_enzyme);
315 }
316 else
317 {
318 std::string peptide_variant = peptide_variant_list.back();
319 while(peptide_variant_list.size() > 0)
320 {
321 enzyme_product.setPeptide(sequence_database_id,
322 protein_sp,
323 is_decoy,
324 QString(peptide_variant.c_str()),
325 start,
326 is_nter,
327 missed_cleavage_number,
328 semi_enzyme);
329 peptide_variant_list.pop_back();
330 if(peptide_variant_list.size() > 0)
331 {
332 peptide_variant = peptide_variant_list.back();
333 }
334 }
335 }
336 std::vector<std::string>().swap(
337 peptide_variant_list); // clear peptide_variant_list reallocating
338 }
339 else
340 {
341 enzyme_product.setPeptide(sequence_database_id,
342 protein_sp,
343 is_decoy,
344 peptide,
345 start,
346 is_nter,
347 missed_cleavage_number,
348 semi_enzyme);
349 }
350}
351
352const QRegularExpression &
357} // namespace pappso
virtual void setPeptide(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme)=0
function to give the products of a protein digestion by an enzyme
QRegularExpression m_recognitionSite
example with a kinase == [K,R]
Definition enzyme.h:89
std::size_t m_maxPeptideVariantListSize
Definition enzyme.h:93
unsigned int getMiscleavage() const
get the maximum number of missed cleavage allowed in the digestion
Definition enzyme.cpp:76
Enzyme()
build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"
Definition enzyme.cpp:32
void setMiscleavage(unsigned int miscleavage)
sets the maximum number of missed cleavage allowed in the digestion
Definition enzyme.cpp:71
std::vector< char > m_wildCardB
Definition enzyme.h:97
std::vector< char > m_wildCardZ
Definition enzyme.h:98
std::vector< char > m_wildCardX
Definition enzyme.h:96
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition enzyme.cpp:276
const QRegularExpression & getQRegExpRecognitionSite() const
Definition enzyme.cpp:353
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition enzyme.cpp:195
void setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
take only first m_takeOnlyFirstWildcard
Definition enzyme.cpp:269
void eat(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
digest a protein into enzyme products
Definition enzyme.cpp:87
unsigned int m_miscleavage
Definition enzyme.h:90
bool m_takeOnlyFirstWildcard
Definition enzyme.h:91
void setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
if there are wildcards in the protein sequence : restrict the number of possible peptide sequences
Definition enzyme.cpp:81
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
QString PeptideStr
A type definition for PeptideStr.
Definition types.h:45
std::shared_ptr< const Protein > ProteinSp
shared pointer on a Protein object
Definition protein.h:47