libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
psmspecpeptidoms.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/cbor/psm/evalscan/psmspecglob.cpp
3 * \date 19/07/2025
4 * \author Olivier Langella
5 * \brief compute specglob alignment on scan's PSM
6 */
7
8/*******************************************************************************
9 * Copyright (c) 2025 Olivier Langella <Olivier.Langella@universite-paris-saclay.fr>.
10 *
11 * This file is part of PAPPSOms-tools.
12 *
13 * PAPPSOms-tools is free software: you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation, either version 3 of the License, or
16 * (at your option) any later version.
17 *
18 * PAPPSOms-tools is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with PAPPSOms-tools. If not, see <http://www.gnu.org/licenses/>.
25 *
26 ******************************************************************************/
27
28
29#include "psmspecpeptidoms.h"
30
39
41 pappso::cbor::CborStreamWriter *cbor_output_p,
42 const QJsonObject &parameters)
43 : PsmFileScanProcessAndCopy(buffer_scan_size, cbor_output_p, "SpecPeptidOms")
44{
45 m_specpeptidomsParameters = parameters;
46
47 if(parameters.value("fragment_tolerance_unit").toString() == "dalton")
48 {
50 parameters.value("fragment_tolerance").toDouble());
51 }
52 else if(parameters.value("fragment_tolerance_unit").toString() == "ppm")
53 {
55 pappso::PrecisionFactory::getPpmInstance(parameters.value("fragment_tolerance").toDouble());
56 }
57
58 QJsonObject spectrum_param = parameters.value("spectrum").toObject();
59
60 m_minimumMz = spectrum_param.value("minimum_mz").toDouble();
61 m_nMostIntense = spectrum_param.value("n_most_intense").toInt();
62 m_deisotope = spectrum_param.value("deisotope").toBool();
63
65
66
67 m_maxInterpretationsPerSpectrum = parameters.value("max_interpretations_per_spectrum").toInt();
68
70}
71
75
76void
78{
79
80
82 m_countScanProcessed += m_bufferScanSize;
83
84 monitor.setStatus(QObject::tr("%1 scan processed").arg(m_countScanProcessed));
85}
86
87void
89{
90 if(m_deisotope)
91 pappso::FilterChargeDeconvolution(m_fragmentTolerance).filter(mass_spectrum);
92 pappso::FilterResampleKeepGreater(m_minimumMz).filter(mass_spectrum);
93 pappso::FilterGreatestY(m_nMostIntense).filter(mass_spectrum);
94}
95
96void
98{
99 if(!m_decoyPrefix.isEmpty())
100 {
101 // generate decoy sequences
102
103 PsmProtein new_decoy_protein;
104 for(auto &it_protein : m_proteinMap.getProteinMap())
105 {
106 new_decoy_protein = it_protein.second;
107 new_decoy_protein.protein_sp =
108 std::make_shared<Protein>(*new_decoy_protein.protein_sp.get());
109 new_decoy_protein.protein_sp.get()->reverse();
110 new_decoy_protein.protein_sp.get()->setAccession(
111 m_decoyPrefix + new_decoy_protein.protein_sp.get()->getAccession());
112 new_decoy_protein.isTarget = false;
113 }
114 }
116}
117
118
119void
121{
122
123 QCborMap cbor_peptidoms_parameters = QCborValue::fromJsonValue(m_specpeptidomsParameters).toMap();
124
125 m_cborParameterMap.insert(QString("peptidoms"), cbor_peptidoms_parameters);
126
127 mp_cborOutput->append("parameter_map");
128 mp_cborOutput->writeCborMap(m_cborParameterMap);
129}
130
133{
134 return new PsmSpecPeptidOmsScan(*this, m_fragmentTolerance);
135}
136
137const pappso::AaCode &
139{
140 return m_aaCode;
141}
142
143
145 const pappso::cbor::psm::PsmSpecPeptidOms &psm_specpeptidoms,
146 pappso::PrecisionPtr fragment_tolerance)
147 : CborScanMapBase(psm_specpeptidoms)
148{
149 mp_psmSpecPeptidOms = &psm_specpeptidoms;
150
152}
153
157
158void
160{
161 // qWarning() << "PsmSpecPeptidOmsScan::process " << keys();
162 if(!keys().contains("id"))
163 {
164 throw pappso::PappsoException(QObject::tr("missing scan id"));
165 }
166 if(keys().contains("psm_list"))
167 {
168 QualifiedMassSpectrumSPtr qualified_mass_spectrum = getCurrentQualifiedMassSpectrumSPtr();
169
170 // qWarning() << "PsmSpecPeptidOmsScan::process "
171 // << qualified_mass_spectrum.get()->getMassSpectrumId().getSpectrumIndex();
172
173 mp_psmSpecPeptidOms->filterMassSpectrum(
174 *(qualified_mass_spectrum.get()->getMassSpectrumSPtr().get()));
175
176 pappso::specpeptidoms::SpOMSSpectrumCsp experimental_spectrum =
177 std::make_shared<pappso::specpeptidoms::SpOMSSpectrum>(
178 *qualified_mass_spectrum.get(),
179 mp_psmSpecPeptidOms->m_fragmentTolerance,
180 mp_psmSpecPeptidOms->getAaCode());
181
182
183 pappso::specpeptidoms::SemiGlobalAlignment semi_global_alignment(
184 mp_psmSpecPeptidOms->m_scoreValues,
185 mp_psmSpecPeptidOms->m_fragmentTolerance,
186 mp_psmSpecPeptidOms->m_aaCode);
187
188
189 QCborArray new_psm_arr;
190 for(QCborValue cbor_psm : value("psm_list").toArray())
191 {
192 QCborMap old_cbor_psm_map = cbor_psm.toMap();
193
194
195 if(!old_cbor_psm_map.keys().contains("proforma"))
196 {
198 QObject::tr("missing proforma in psm %1").arg(old_cbor_psm_map.keys().size()));
199 }
201 old_cbor_psm_map.value("proforma").toString());
202
203
204 pappso::specpeptidoms::SpOMSProtein protein(old_cbor_psm_map.value("proforma").toString(),
205 peptide_sp.get()->getSequence(),
206 mp_psmSpecPeptidOms->m_aaCode);
207
208 sequenceAlignment(false,
209 old_cbor_psm_map,
210 new_psm_arr,
211 experimental_spectrum,
212 semi_global_alignment,
213 &protein);
214
215 if(!m_decoyPrefix.isEmpty())
216 {
217 QString sequence = peptide_sp.get()->getSequence();
218 std::reverse(sequence.begin(), sequence.end());
219
221 m_decoyPrefix + old_cbor_psm_map.value("proforma").toString(),
222 sequence,
223 mp_psmSpecPeptidOms->m_aaCode);
224
225 sequenceAlignment(true,
226 old_cbor_psm_map,
227 new_psm_arr,
228 experimental_spectrum,
229 semi_global_alignment,
230 &protein_rev);
231 }
232
233
234 // qWarning() << "new_psm_arr.size()=" << new_psm_arr.size();
235 }
236
237 // insert(QString("psm_list"), new_psm_arr);
238 remove(QString("psm_list"));
239 insert(QString("psm_list"), new_psm_arr);
240
241 filterPsmListUniqueUniqueProforma();
242 }
243 // qWarning() << "PsmSpecPeptidOmsScan::process end";
244}
245
246
247void
249 bool is_reverse,
250 const QCborMap &old_cbor_psm_map,
251 QCborArray &new_psm_arr,
252 pappso::specpeptidoms::SpOMSSpectrumCsp &experimental_spectrum,
253 pappso::specpeptidoms::SemiGlobalAlignment &semi_global_alignment,
254 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
255{
256
257 std::vector<pappso::specpeptidoms::Location> locations;
258 std::vector<double> potential_mass_errors;
259 const QString &sequence = protein_ptr->getSequence();
260 // qWarning() << "PsmSpecPeptidOmsScan::process " << peptide_sequence;
261 // do not take into account peptide containing too much redundancy
263 {
264 if((sequence.size() >= 8) &&
266 return;
267
268 semi_global_alignment.fastAlign(*experimental_spectrum.get(), protein_ptr);
269 // qDebug() << "Completed fastAlign";
270 locations = semi_global_alignment.getLocationSaver().getLocations();
271
272 qDebug() << "locations.size():" << locations.size();
273 for(auto loc : locations)
274 {
275 QCborMap new_cbor_psm;
276 qDebug() << "beginning=" << loc.beginning << "length=" << loc.length
277 << "tree=" << loc.tree << "score=" << loc.score
278 << "protein=" << loc.proteinPtr->getAccession();
279 semi_global_alignment.preciseAlign(
280 *experimental_spectrum.get(), loc.proteinPtr, loc.beginning, loc.length);
281 qDebug() << "Completed preciseAlign";
282
283 pappso::specpeptidoms::Alignment best_alignment =
284 semi_global_alignment.getBestAlignment();
285 /* qDebug() << "Best alignment" << best_alignment.interpretation
286 << best_alignment.score << "SPC" << best_alignment.SPC
287 << "beginning" << best_alignment.beginning << "end"
288 << best_alignment.end;*/
289 if(best_alignment.end > (std::size_t)sequence.size())
290 {
291 throw pappso::ExceptionOutOfRange(QString("best_alignment.end > "
292 "(std::size_t)sequence.size() : %1 %2")
293 .arg(best_alignment.end)
294 .arg(sequence.size()));
295 }
296 if(best_alignment.begin_shift > 0 || best_alignment.end_shift > 0 ||
297 best_alignment.shifts.size() > 0)
298 {
299 qDebug();
300 potential_mass_errors =
302 mp_psmSpecPeptidOms->m_aaCode, best_alignment, sequence);
303 qDebug();
304 semi_global_alignment.postProcessingAlign(*experimental_spectrum.get(),
305 loc.proteinPtr,
306 loc.beginning,
307 loc.length,
308 potential_mass_errors);
309
310 qDebug() << "semi_global_alignment.getBestAlignment()";
311 pappso::specpeptidoms::Alignment best_post_processed_alignment =
312 semi_global_alignment.getBestAlignment();
313 if(best_post_processed_alignment.SPC > best_alignment.SPC)
314 {
315 qDebug() << "Best post-processed alignment"
316 << best_post_processed_alignment.m_peptideModel.toInterpretation()
317 << best_post_processed_alignment.score << "SPC"
318 << best_post_processed_alignment.SPC;
319 storeAlignment(is_reverse,
320 old_cbor_psm_map,
321 new_cbor_psm,
322 protein_ptr->getAccession(),
323 best_post_processed_alignment);
324 }
325 else
326 {
327 qDebug() << "no improvement in post-processing";
328 storeAlignment(is_reverse,
329 old_cbor_psm_map,
330 new_cbor_psm,
331 protein_ptr->getAccession(),
332 best_alignment);
333 }
334 }
335 else
336 {
337
338 storeAlignment(is_reverse,
339 old_cbor_psm_map,
340 new_cbor_psm,
341 protein_ptr->getAccession(),
342 best_alignment);
343 }
344
345 if(!new_cbor_psm.isEmpty())
346 {
347 new_psm_arr.push_back(new_cbor_psm);
348 }
349 }
350 }
351}
352
353void
355{
356 std::size_t max_psm = mp_psmSpecPeptidOms->m_maxInterpretationsPerSpectrum;
357
358
359 struct sortPsmResults
360 {
361 qint64 score;
362 QCborMap psm;
363 };
364
365 QCborArray old_psm_arr = value("psm_list").toArray();
366 QCborArray new_psm_arr;
367 // for(QCborValue cbor_psm : old_psm_arr) {
368
369
370 std::vector<sortPsmResults> sort_psm_list;
371 for(auto it_psm : old_psm_arr)
372 {
373 QCborMap psm_map = it_psm.toMap();
374 qint64 score =
375 psm_map.value("eval").toMap().value("peptidoms").toMap().value("score").toInteger();
376 sort_psm_list.push_back({score, psm_map});
377 }
378 qDebug();
379 std::sort(sort_psm_list.begin(), sort_psm_list.end(), [](sortPsmResults &a, sortPsmResults &b) {
380 return a.score > b.score;
381 });
382
383
384 auto it_end = sort_psm_list.begin() + max_psm;
385
386
387 qDebug() << sort_psm_list.size();
388 for(auto it = sort_psm_list.begin(); it != sort_psm_list.end() && it != it_end; it++)
389 {
390
391 new_psm_arr.append(it->psm);
392 qDebug();
393 }
394
395 // qWarning() << new_psm_arr.size();
396 remove(QString("psm_list"));
397 insert(QString("psm_list"), new_psm_arr);
398}
399
400
401void
403 bool is_reverse,
404 const QCborMap &old_cbor_psm,
405 QCborMap &new_cbor_psm,
406 const QString &accession,
407 const pappso::specpeptidoms::Alignment &alignment)
408{
409 qDebug() << accession;
410
411 if(alignment.score > 0)
412 {
413
414 QString peptide_key(alignment.m_peptideModel.toProForma());
415
416
417 /*
418
419 {"proforma":"LHGAGADDADADD",
420 "protein_list":
421 [{
422 "accession": "GRMZM2G108267_P01",
423 "positions": [
424 337
425 ]
426 }
427 {
428 "accession": "GRMZM2G108267_P02",
429 "positions": [
430 337
431 ]
432 }
433 ]
434 ,
435 "eval":{
436 "matcher": {
437 "score": 193077
438 }
439 }
440 }
441 */
442
443 // do not take into account peptide containing too much redundancy
445 {
446 // qDebug() << "peptide_key=" << peptide_key;
447 new_cbor_psm.insert(QString("proforma"), peptide_key);
448
449 // copy protein list in new psm
450 new_cbor_psm.insert(QString("protein_list"), old_cbor_psm.value("protein_list"));
451
452 fixPositionStart(is_reverse, new_cbor_psm, alignment.getPositionStart());
453
454 QCborMap cbor_eval;
455
456 QCborMap cbor_peptidoms;
457
458 // auto it_protein_pos =
459 // one_peptide_result.map_protein2positions.insert({accession, QCborArray()});
460 // it_protein_pos.first->second.append((qint64)alignment.getPositionStart());
461
462 cbor_peptidoms.insert(QString("bracket"), alignment.m_peptideModel.toInterpretation());
463 cbor_peptidoms.insert(QString("spc"), (qint64)alignment.SPC);
464 cbor_peptidoms.insert(QString("score"), alignment.score);
465 cbor_peptidoms.insert(QString("nam"), alignment.getNonAlignedMass());
466
467
468 // copy matcher eval in new psm
469 cbor_eval.insert(QString("matcher"), old_cbor_psm.value("eval").toMap().value("matcher"));
470
471
472 // store peptidoms eval in new psm
473 cbor_eval.insert(QString("peptidoms"), cbor_peptidoms);
474
475 new_cbor_psm.insert(QString("eval"), cbor_eval);
476 }
477 }
478}
479void
481 QCborMap &new_cbor_psm,
482 std::size_t offset_position) const
483{
484 QCborArray new_protein_list;
485 for(auto qcbor_protein : new_cbor_psm.value("protein_list").toArray())
486 {
487 QCborMap protein;
488
489 QCborArray positions;
490 if(is_reverse)
491 {
492 protein.insert(QString("accession"),
493 m_decoyPrefix + qcbor_protein.toMap().value("accession").toString());
494 for(auto position : qcbor_protein.toMap().value("positions").toArray())
495 {
496 positions.append(position.toInteger() + (qint64)offset_position);
497 }
498 }
499 else
500 {
501
502 protein.insert(QString("accession"), qcbor_protein.toMap().value("accession"));
503 for(auto position : qcbor_protein.toMap().value("positions").toArray())
504 {
505 positions.append(position.toInteger() + (qint64)offset_position);
506 }
507 }
508
509 protein.insert(QString("positions"), positions.toCborValue());
510
511 new_protein_list.append(protein);
512 }
513 new_cbor_psm.insert(QString("protein_list"), new_protein_list);
514}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
void addAaModification(char aa_letter, AaModificationP aaModification)
add a modification on an amino acid for example carbamido on C
Definition aacode.cpp:168
static AaModificationP getInstance(const QString &accession)
Trace & filter(Trace &data_points) const override
get all the datapoints and remove different isotope and add their intensity and change to charge = 1 ...
keep N datapoints form the greatest intensities to the lowest
Definition filterpass.h:96
Trace & filter(Trace &data_points) const override
Trace & filter(Trace &trace) const override
Class to represent a mass spectrum.
static PeptideSp parseString(const QString &pepstr)
static PrecisionPtr getPpmInstance(pappso_double value)
get a ppm precision pointer
static PrecisionPtr getDaltonInstance(pappso_double value)
get a Dalton precision pointer
virtual void setStatus(const QString &status)=0
current status of the process
Basic PSM file reader to process scan (parallelized scan processing) and write a new resulting PSM fi...
virtual void proteinMapReady(pappso::UiMonitorInterface &monitor) override
virtual void processBufferScanDone(pappso::UiMonitorInterface &monitor) override
void sequenceAlignment(bool is_reverse, const QCborMap &old_cbor_psm_map, QCborArray &new_psm_arr, pappso::specpeptidoms::SpOMSSpectrumCsp &experimental_spectrum, pappso::specpeptidoms::SemiGlobalAlignment &semi_global_alignment, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
void storeAlignment(bool is_reverse, const QCborMap &old_cbor_psm, QCborMap &new_cbor_psm, const QString &accession, const pappso::specpeptidoms::Alignment &alignment)
void fixPositionStart(bool is_reverse, QCborMap &new_cbor_psm, std::size_t offset_position) const
const PsmSpecPeptidOms * mp_psmSpecPeptidOms
PsmSpecPeptidOmsScan(const PsmSpecPeptidOms &psm_specpeptidoms, pappso::PrecisionPtr fragment_tolerance)
void parameterMapReady(pappso::UiMonitorInterface &monitor) override
virtual void proteinMapReady(pappso::UiMonitorInterface &monitor) override
PsmSpecPeptidOms(std::size_t buffer_scan_size, CborStreamWriter *cbor_output_p, const QJsonObject &parameters)
const pappso::AaCode & getAaCode() const
void filterMassSpectrum(pappso::MassSpectrum &mass_spectrum) const
CborScanMapBase * newCborScanMap() override
virtual void processBufferScanDone(pappso::UiMonitorInterface &monitor) override
std::vector< Location > getLocations() const
Returns a vector containing the saved locations.
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::shared_ptr< const SpOMSSpectrum > SpOMSSpectrumCsp
std::shared_ptr< QualifiedMassSpectrum > QualifiedMassSpectrumSPtr
std::shared_ptr< const Peptide > PeptideSp
std::shared_ptr< Protein > protein_sp
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::size_t getPositionStart() const
get position of start on the protein sequence