libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.h
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.h
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 * This program is free software: you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation, either version 3 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License
29 * along with this program. If not, see <http://www.gnu.org/licenses/>.
30 */
31
32#pragma once
33
34#include "spomsspectrum.h"
35#include "../../protein/protein.h"
36#include "scorevalues.h"
37#include "locationsaver.h"
38#include "scenario.h"
39#include "peptidemodel.h"
40#include "spomsprotein.h"
41
42namespace pappso
43{
44namespace specpeptidoms
45{
46
47struct KeyCell
48{
49 std::size_t n_row;
50 int score;
51 std::size_t beginning;
53};
54
56{
57 /** @brief reinitialize to default score_values
58 */
59 void reset();
60 /** @brief convenient function to get peptide sequence from location
61 */
62 QString getPeptideString(const QString &protein_sequence) const;
63
64
65 /** @brief convenient function to get the remaining non explained mass shift
66 *
67 * non explained mass delta between the peptide chemical formula and the observed experimental
68 * spectrum precursor
69 */
70 double getNonAlignedMass() const;
71
72 /** @brief get position of start on the protein sequence
73 */
74 std::size_t getPositionStart() const;
75
76 std::vector<std::size_t> peaks; // List of the spectrum's peaks used by the alignment
77 PeptideModel m_peptideModel; // Peptide model representing the alignment, with mass shifts and
78 // sequence shifts
79 int score = 0; // Final alignment score
80 double begin_shift = 0.0, // Shift between the spectrum's first peak (at 19.02 Da) and the
81 // alignment's last peak (first in N->C order).
82 end_shift = 0.0; // Missing mass between the alignment's total mass (including begin_shift) and
83 // the spectrum precursor mass.
84 std::vector<double> shifts; // List of mass shifts present in the alignment
85 std::size_t SPC = 0, // SPC : Shared Peak Count
86 beginning = 0, // Localization of the alignment's first amino acid in the peptide sequence
87 end = 0; // Localization of the alignment's last amino acid in the peptide sequence
88};
89
91{
92 public:
93 /**
94 * Default constructor
95 */
96 SemiGlobalAlignment(const ScoreValues &score_values,
97 const pappso::PrecisionPtr precision_ptr,
98 const AaCode &aaCode);
99
100 /**
101 * Destructor
102 */
104
105 /**
106 * @brief perform the first alignment search between a protein sequence and a spectrum. The member
107 * location heap is filled with the candidates locations.
108 * @param spectrum Spectrum to align
109 * @param protein_ptr Protein pointer on the sequence to align.
110 */
111 void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr);
112
113 /**
114 * @brief performs the second alignment search between a protein subsequence and a spectrum.
115 * @param spectrum Spectrum to align
116 * @param protein_ptr Protein pointer on the sequence to align.
117 * @param beginning Index of the beginning of the subsequence to align.
118 * @param length Length of the subsequence to align.
119 */
120 void preciseAlign(const SpOMSSpectrum &spectrum,
121 const SpOMSProtein *protein_ptr,
122 const std::size_t beginning,
123 const std::size_t length);
124
125 /**
126 * @brief performs the post-processing : generates corrected spectra and align them
127 * @param spectrum Spectrum to align
128 * @param protein_ptr Protein pointer on the sequence to align.
129 * @param beginning Index of the beginning of the subsequence to align.
130 * @param length Length of the subsequence to align.
131 * @param shifts List of potential precursor mass errors to test.
132 */
133 void postProcessingAlign(const SpOMSSpectrum &spectrum,
134 const SpOMSProtein *protein_ptr,
135 std::size_t beginning,
136 std::size_t length,
137 const std::vector<double> &shifts);
138
139 /**
140 * @brief Returns a copy of m_location_saver.
141 */
143
144 /**
145 * @brief Returns a copy of m_scenario.
146 */
147 Scenario getScenario() const;
148
149 /**
150 * @brief Returns a const ref to m_best_alignment.
151 */
152 const Alignment &getBestAlignment() const;
153
154
155 /** @brief convenient function for degub purpose
156 */
157 const std::vector<KeyCell> &getInterestCells() const;
158
159
160 /**
161 * @brief Returns a list of the potential mass errors corresponding to the provided alignment in
162 * the provided protein sequence.
163 * @param aa_code the amino acid code of reference to get aminon acid masses
164 * @param alignment Alignment for which to get the potential mass errors.
165 * @param protein_seq Protein sequence corresponding to the provided alignment.
166 */
167 static std::vector<double> getPotentialMassErrors(const pappso::AaCode &aa_code,
168 const Alignment &alignment,
169 const QString &protein_seq);
170
171 /** @brief check that the sequence has a minimum of amino acid checkSequenceDiversity
172 * @param sequence protein sequence
173 * @param window the size of substring to check
174 * @param minimum_aa_diversity minimum number of different amino acid in this window
175 */
176 static bool checkSequenceDiversity(const QString &sequence,
177 std::size_t window,
178 std::size_t minimum_aa_diversity);
179
180
181 private:
182 /**
183 * @brief Stores the best alignment from m_scenario in m_best_alignment
184 * @param sequence reversed sequence of the current alignment.
185 * @param spectrum Spectrum currently being aligned.
186 * @param offset Size of the protein sequence minus beginning of the alignment. Used to compute
187 * the position of the alignment in the protein sequence.
188 */
189 void saveBestAlignment(const SpOMSProtein &sequence,
190 const SpOMSSpectrum &spectrum,
191 std::size_t offset);
192
193 /**
194 * @brief Recursively performs the correction of the alignment.
195 * @param protein_seq Protein reversed sequence to align.
196 * @param protein_ptr Protein pointer on the sequence to align.
197 * @param spectrum Spectrum to align.
198 * @param peaks_to_remove Peaks to remove from the spectrum.
199 * @param offset Size of the protein sequence minus beginning of the alignment. Used to compute
200 * the position of the alignment in the protein sequence.
201 */
202 void correctAlign(const SpOMSProtein &protein_subseq,
203 const SpOMSProtein *protein_ptr,
204 const SpOMSSpectrum &spectrum,
205 std::vector<std::size_t> &peaks_to_remove,
206 std::size_t offset);
207
208 /**
209 * @brief updates the scores of the alignment matrix for a given amino acid as well as the
210 * location heap/scenario.
211 * @param sequence Reversed sequence of the protein being aligned
212 * @param row_number number of the row to update (== index in sequence of the amino acid being
213 * aligned)
214 * @param aa_positions list of the AaPositions of the current amino acid
215 * @param spectrum Spectrum being aligned
216 * @param fast_align Whether to use the fast version of the algorithm (for 1st alignemnt step)
217 * @param protein_ptr Protein pointer on the sequence to align.
218 */
220 const std::size_t row_number,
221 const std::vector<AaPosition> &aa_positions,
222 const SpOMSSpectrum &spectrum,
223 const bool fast_align,
224 const pappso::specpeptidoms::SpOMSProtein *protein_ptr);
225
226 /**
227 * @brief indicates if a perfect shift is possible between the provided positions
228 * @param sequence Reversed sequence of the protein being aligned
229 * @param spectrum Spectrum being aligned
230 * @param origin_row beginning row of the aa gap to verify (== index of the first missing aa in
231 * sequence)
232 * @param current_row row being processed (== index of the current AaPosition in sequence)
233 * @param l_peak left peak index of the mz gap to verify
234 * @param r_peak right peak index of the mz gap to verify
235 */
237 const SpOMSSpectrum &spectrum,
238 const std::size_t origin_row,
239 const std::size_t current_row,
240 const std::size_t l_peak,
241 const std::size_t r_peak) const;
242
243 /**
244 * @brief indicates if a perfect shift is possible from the spectrum beginning to the provided
245 * peak.
246 * @param sequence Reversed sequence of the protein being aligned
247 * @param spectrum Spectrum being aligned
248 * @param current_row row being processed (== index of the current AaPosition in sequence)
249 * @param r_peak right peak index of the mz gap to verify
250 */
252 const SpOMSSpectrum &spectrum,
253 const std::size_t current_row,
254 const std::size_t r_peak) const;
255
256 /**
257 * @brief indicates if a perfect shift is possible between the provided positions
258 * @param sequence Reversed sequence of the protein being aligned
259 * @param spectrum Spectrum being aligned
260 * @param end_row Index of the last aligned row.
261 * @param end_peak Index of the last aligned peak.
262 */
264 const SpOMSSpectrum &spectrum,
265 std::size_t end_row,
266 std::size_t end_peak) const;
267
268
269 private:
270 std::vector<KeyCell> m_interest_cells;
271 std::vector<std::pair<std::size_t, KeyCell>> m_updated_cells;
273 const int min_score = 15;
281};
282} // namespace specpeptidoms
283} // namespace pappso
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
modelize peptide sequence to facilitate rendering in bracket or proforma
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak.
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence