libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
bafasciifilereader.cpp
Go to the documentation of this file.
1/////////////////////// StdLib includes
2#include <iostream>
3#include <iomanip>
4
5
6/////////////////////// Qt includes
7#include <QDebug>
8#include <QFile>
9#include <QFileInfo>
10
11
12/////////////////////// libpwiz includes
13#include <pwiz/data/msdata/DefaultReaderList.hpp>
14
15
16/////////////////////// Local includes
17#include "bafasciifilereader.h"
19#include "../utils.h"
20#include "../types.h"
21#include "../msrun/msrunid.h"
22
23
24namespace pappso
25{
26
27static const std::size_t CHECKED_LINES_COUNT = 10;
28
30 : MsFileReader{file_name}
31{
32 // To avoid initializing multiple times (costly process), we
33 // only initialize when needed, that is, upon getMsRunIds().
34 // initialize();
35}
36
37
41
/**
 * @brief Check that the first lines of the file look like Bruker BAF ASCII
 * mass spectrum lines.
 *
 * The file named by m_fileName is opened read-only in text mode and read line
 * by line up to its end. Only lines found among the first CHECKED_LINES_COUNT
 * physical lines are matched against the format regular expression and have
 * their fields converted; all later lines are read and discarded.
 *
 * @param line_count Out parameter. Reset to 0 on entry, then incremented once
 * for each checked line that matched the expected format.
 *
 * @return true if no checked line failed validation and at least one line was
 * validated; false otherwise, including when the file cannot be opened.
 */
42bool
43BafAsciiFileReader::initialize(std::size_t &line_count)
44{
45 // Here we just test some of the lines of the file to check that they comply
46 // with the brukerBafAscii format.
47
48 line_count = 0;
49
50 QFile file(m_fileName);
51
52 if(!file.open(QFile::ReadOnly | QFile::Text))
53 {
54 qDebug() << "Failed to open file" << m_fileName;
55
56 return false;
57 }
58
59 // Construct the regular expression pattern, piecemeal...
 // Each parenthesized piece below is a capture group; the captured(N) calls
 // further down rely on the order in which the pieces are appended here.
60
61 // The retention time as the very first value in the line.
62
63 QString regexp_pattern = QString("^(%1)").arg(
 // NOTE(review): the argument of this .arg() call is not visible in this
 // listing (a source line is missing here) -- check against the real file.
65
66 // The ionization mode (positive or negative)
67 regexp_pattern += QString(",([+-])");
68
 // The ion source type, one of the two literals ESI or MALDI.
69 regexp_pattern += QString(",(ESI|MALDI)");
70
71 // The MS level (ms1 for full scan mass spectrum)
72 regexp_pattern += QString(",ms(\\d)");
73
74 // Do not know what this is for.
75 regexp_pattern += QString(",(-)");
76
77 // The type of peak (profile or centroid).
 // NOTE(review): the literal matched for the non-profile case is "line";
 // presumably this is how the format names centroid data -- confirm.
78 regexp_pattern += QString(",(profile|line)");
79
80 // The m/z range of the mass spectrum.
81
82 regexp_pattern +=
83 QString(",(%1-%2)")
 // NOTE(review): the .arg() calls filling %1 and %2 are not visible in this
 // listing (source lines are missing here) -- check against the real file.
86
87 // The count of peaks following this element in the remainder of the line.
88
89 regexp_pattern += QString(",(\\d+)");
90
 // Everything left on the line after the peak count (the peak list itself).
91 regexp_pattern += QString("(.*$)");
92
93 // qDebug() << "The full regexp_pattern:" << regexp_pattern;
94
95 QRegularExpression line_regexp(regexp_pattern);
96
97 QRegularExpressionMatch regexp_match;
98
99 QString line;
100 bool file_reading_failed = false;
101 bool ok = false;
102
103 // Reading, parsing and checking lines is extremely time consuming.
104 // What we want here is to reduce the time spent reading all the file's
105 // lines. We parse and check the first CHECKED_LINES_COUNT lines and then
106 // avoid parsing and checking, just going through the remaining lines.
 // NOTE(review): line_count is only incremented for lines that were checked
 // and found valid, so on return it holds the number of validated lines, not
 // the total number of lines read from the file.
 // NOTE(review): iter is incremented for every line read, including comment
 // and blank lines, so those lines consume part of the CHECKED_LINES_COUNT
 // budget; if the first CHECKED_LINES_COUNT lines contain no data line,
 // line_count stays at 0 and the function returns false.
109 std::size_t iter = 0;
110
111 while(!file.atEnd())
112 {
113 line = file.readLine().trimmed();
114
115 ++iter;
116 // qDebug() << "Read one line more: (not yet checked)" << iter;
 // Past the checking budget: keep consuming lines without inspecting them.
117 if(iter > CHECKED_LINES_COUNT)
118 continue;
119
 // Skip comment lines, empty lines and lines matching the end-of-line
 // regular expression.
120 if(line.startsWith('#') || line.isEmpty() ||
121 Utils::endOfLineRegExp.match(line).hasMatch())
122 continue;
123
124 // qDebug() << "Current brukerBafAscii format line " << line_count << ": "
125 // << line.left(30) << " ... " << line.right(30);
126
127 regexp_match = line_regexp.match(line);
128
129 if(regexp_match.hasMatch())
130 {
131 // qDebug() << "The match succeeded.";
132
 // The converted values are discarded throughout; only the success flag
 // of each conversion is used.
133 regexp_match.captured(1).toDouble(&ok);
134 if(!ok)
135 {
136 qDebug()
137 << "Failed to extract the retention time of the mass spectrum.";
138
139 file_reading_failed = true;
140
141 break;
142 }
143
144 QString ionization_mode = regexp_match.captured(2);
145 QString source_type = regexp_match.captured(3);
146
147 regexp_match.captured(4).toInt(&ok);
148 if(!ok)
149 {
150 qDebug()
151 << "Failed to extract the MS level of the mass spectrum.";
152
153 file_reading_failed = true;
154
155 break;
156 }
157
 // Capture 5 (the "-" field) is intentionally not read.
158 QString peak_shape_type = regexp_match.captured(6);
159
160 QString mz_range = regexp_match.captured(7);
161
 // Text before the dash: the start of the m/z range.
162 mz_range.left(mz_range.indexOf("-")).toDouble(&ok);
163 if(!ok)
164 {
165 qDebug() << "Failed to extract the start of the m/z range.";
166
167 file_reading_failed = true;
168
169 break;
170 }
171
 // NOTE(review): QString::right(n) returns the n rightmost characters, so
 // this converts the last (dash position + 1) characters of mz_range, not
 // the text that follows the dash. QString::mid(indexOf("-") + 1) would
 // select the end of the range. Since only the conversion status is used,
 // the check usually still passes, but on the wrong substring.
172 mz_range.right(mz_range.indexOf("-") + 1).toDouble(&ok);
173 if(!ok)
174 {
175 qDebug() << "Failed to extract the end of the m/z range.";
176
177 file_reading_failed = true;
178
179 break;
180 }
181
182 // qDebug() << qSetRealNumberPrecision(10)
183 // << "mz_range_start: " << mz_range_start
184 // << "mz_range_end: " << mz_range_end;
185
186 int peak_count = regexp_match.captured(8).toInt(&ok);
187 if(!ok)
188 {
189 qDebug() << "Failed to extract the number of peaks in the mass "
190 "spectrum.";
191
192 file_reading_failed = true;
193
194 break;
195 }
196
 // The remainder of the line is split on commas, dropping empty parts.
197 QString peaks = regexp_match.captured(9);
198 QStringList peaks_stringlist = peaks.split(",", Qt::SkipEmptyParts);
199
200 // qDebug() << "The number of peaks:" << peaks_stringlist.size();
201
202 // Sanity check: the number of comma-separated items must equal the
 // peak count announced earlier in the line.
203 if(peaks_stringlist.size() != peak_count)
204 {
205 // qDebug() << "The number of peaks in the mass spectrum does not
206 // "
207 // "match the advertised one.";
208
209 file_reading_failed = true;
210
211 break;
212 }
213
214 // qDebug() << "The retention time:" << retention_time
215 // << "the ionization mode: " << ionization_mode
216 // << "the source type: " << source_type
217 // << "MS level is:" << ms_level
218 // << "peak shape type: " << peak_shape_type
219 // << "m/z range: " << mz_range << "peak count: " <<
220 // peak_count
221 // << "and peaks: " << peaks.left(100) << " ... "
222 // << peaks.right(100) << "";
223
224 // If we are here, that means that the read line has conformed
225 // to the format expected.
226 ++line_count;
227 // qDebug() << "Checked one line more:" << line_count;
228 }
229 // End of
230 // if(regexp_match.hasMatch())
231 else
232 {
 // A non-comment, non-empty line among the checked ones does not match
 // the expected format: give up on the whole file.
233 qDebug() << "The match failed.";
234 file_reading_failed = true;
235
236 break;
237 }
238 }
239 // End of
240 // while(!file.atEnd())
241
242 file.close();
243
 // Success requires both that no checked line failed and that at least one
 // line was actually validated.
244 if(!file_reading_failed && line_count >= 1)
245 {
247 return true;
248 }
249
251
252 // qDebug() << "The number of parsed mass spectra: " << line_count;
253
254 // qDebug() << "Detected file format:"
255 // << Utils::msDataFormatAsString(m_fileFormat)
256 // << "with number of spectra: " << line_count;
257
258 return false;
259}
260
261
267
268
269std::vector<MsRunIdCstSPtr>
270BafAsciiFileReader::getMsRunIds(const QString &run_prefix)
271{
272 std::vector<MsRunIdCstSPtr> ms_run_ids;
273
274 std::size_t ms_data_line_count = 0;
275
276 if(!initialize(ms_data_line_count))
277 return ms_run_ids;
278
279 // Finally create the MsRunId with the file name.
280 MsRunId ms_run_id(m_fileName);
281 ms_run_id.setMsDataFormat(m_fileFormat);
282
283 // We need to set the unambiguous xmlId string.
284 ms_run_id.setXmlId(
285 QString("%1%2").arg(run_prefix).arg(Utils::getLexicalOrderedString(0)));
286
287 // Craft a meaningful sample name because otherwise all the files loaded from
288 // text files will have the same sample name and it will be difficult to
289 // differentiate them.
290 // Orig version:
291 // ms_run_id.setRunId("Single spectrum");
292 // Now the sample name is nothing but the file name without the path.
293
294 QFileInfo file_info(m_fileName);
295
296 // qDebug() << "file name:" << m_fileName;
297
298 QString sample_name = file_info.fileName();
299
300 // qDebug() << "sample name:" << sample_name;
301
302 ms_run_id.setRunId(sample_name);
303
304 // Now set the sample name to the run id:
305
306 ms_run_id.setSampleName(ms_run_id.getRunId());
307
308 // qDebug() << __FILE__ << "@" << __LINE__ << __FUNCTION__ << "()"
309 //<< "Current ms_run_id:" << ms_run_id.toString();
310
311 // Finally make a shared pointer out of it and append it to the vector.
312 ms_run_ids.push_back(std::make_shared<MsRunId>(ms_run_id));
313
314 return ms_run_ids;
315}
316
317
318} // namespace pappso
virtual MsDataFormat getFileFormat() override
virtual bool initialize(std::size_t &line_count)
virtual std::vector< MsRunIdCstSPtr > getMsRunIds(const QString &run_prefix) override
BafAsciiFileReader(const QString &file_name)
MsDataFormat m_fileFormat
MS run identity MsRunId identifies an MS run with a unique ID (XmlId) and contains eventually informa...
Definition msrunid.h:54
const QString & getRunId() const
Definition msrunid.cpp:130
void setRunId(const QString &run_id)
Definition msrunid.cpp:123
void setXmlId(const QString &xml_id)
set an XML unique identifier for this MsRunId
Definition msrunid.cpp:137
void setMsDataFormat(MsDataFormat format)
Definition msrunid.cpp:158
void setSampleName(const QString &name)
set a sample name for this MsRunId
Definition msrunid.cpp:79
static QRegularExpression unsignedDoubleNumberNoExponentialRegExp
Definition utils.h:53
static const QString getLexicalOrderedString(unsigned int num)
Definition utils.cpp:74
static QRegularExpression endOfLineRegExp
Regular expression that tracks the end of line in text files.
Definition utils.h:68
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
MsDataFormat
Definition types.h:120
@ unknown
unknown format
static const std::size_t CHECKED_LINES_COUNT
This header contains all the type re-definitions and all the global variables definitions used in the...