libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.h
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.h
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 * This program is free software: you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation, either version 3 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License
29 * along with this program. If not, see <http://www.gnu.org/licenses/>.
30 */
31
32#pragma once
33
34#include "spomsspectrum.h"
36#include "scorevalues.h"
37#include "locationsaver.h"
38#include "scenario.h"
39#include "peptidemodel.h"
40#include "spomsprotein.h"
42
43namespace pappso
44{
45namespace specpeptidoms
46{
47
48struct KeyCell
49{
50 std::size_t n_row;
51 int score;
52 std::size_t beginning;
54};
55
57{
58 /** @brief reinitialize to default score_values
59 */
60 void reset();
61 /** @brief convenient function to get peptide sequence from location
62 */
63 QString getPeptideString(const QString &protein_sequence) const;
64
65
66 /** @brief convenient function to get the remaining non explained mass shift
67 *
68 * non explained mass delta between the peptide chemical formula and the observed experimental
69 * spectrum precursor
70 */
71 double getNonAlignedMass() const;
72
73 /** @brief get position of start on the protein sequence
74 */
75 std::size_t getPositionStart() const;
76
77 std::vector<std::size_t> peaks; // List of the spectrum's peaks used by the alignment
78 PeptideModel m_peptideModel; // Peptide model representing the alignment, with mass shifts and
79 // sequence shifts
80 int score = 0; // Final alignment score
81 double begin_shift = 0.0, // Shift between the spectrum's first peak (at 19.02 Da) and the
82 // alignment's last peak (first in N->C order).
83 end_shift = 0.0; // Missing mass between the alignment's total mass (including begin_shift) and
84 // the spectrum precursor mass.
85 std::vector<double> shifts; // List of mass shifts present in the alignment
86 std::size_t SPC = 0, // SPC : Shared Peak Count
87 beginning = 0, // Localization of the alignment's first amino acid in the peptide sequence
88 end = 0; // Localization of the alignment's last amino acid in the peptide sequence
89};
90
92{
93 public:
94 /**
95 * Default constructor
96 */
97 SemiGlobalAlignment(const ScoreValues &score_values,
98 const pappso::PrecisionPtr precision_ptr,
99 const AaCode &aaCode);
100
101 /**
102 * Destructor
103 */
105
106 /**
107 * @brief perform the first alignment search between a protein sequence and a spectrum. The member
108 * location heap is filled with the candidates locations.
109 * @param spectrum Spectrum to align
110 * @param protein_ptr Protein pointer on the sequence to align.
111 */
112 void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr);
113
114 /**
115 * @brief performs the second alignment search between a protein subsequence and a spectrum.
116 * @param spectrum Spectrum to align
117 * @param protein_ptr Protein pointer on the sequence to align.
118 * @param beginning Index of the beginning of the subsequence to align.
119 * @param length Length of the subsequence to align.
120 */
121 void preciseAlign(const SpOMSSpectrum &spectrum,
122 const SpOMSProtein *protein_ptr,
123 const std::size_t beginning,
124 const std::size_t length);
125
126 /**
127 * @brief performs the post-processing : generates corrected spectra and align them
128 * @param spectrum Spectrum to align
129 * @param protein_ptr Protein pointer on the sequence to align.
130 * @param beginning Index of the beginning of the subsequence to align.
131 * @param length Length of the subsequence to align.
132 * @param shifts List of potential precursor mass errors to test.
133 */
134 void postProcessingAlign(const SpOMSSpectrum &spectrum,
135 const SpOMSProtein *protein_ptr,
136 std::size_t beginning,
137 std::size_t length,
138 const std::vector<double> &shifts);
139
140 /**
141 * @brief Returns a copy of m_location_saver.
142 */
144
145 /**
146 * @brief Returns a copy of m_scenario.
147 */
148 Scenario getScenario() const;
149
150 /**
151 * @brief Returns a const ref to m_best_alignment.
152 */
153 const Alignment &getBestAlignment() const;
154
155
156 /** @brief convenient function for degub purpose
157 */
158 const std::vector<KeyCell> &getInterestCells() const;
159
160
161 /**
162 * @brief Returns a list of the potential mass errors corresponding to the provided alignment in
163 * the provided protein sequence.
164 * @param aa_code the amino acid code of reference to get aminon acid masses
165 * @param alignment Alignment for which to get the potential mass errors.
166 * @param protein_seq Protein sequence corresponding to the provided alignment.
167 */
168 static std::vector<double> getPotentialMassErrors(const pappso::AaCode &aa_code,
169 const Alignment &alignment,
170 const QString &protein_seq);
171
172 /** @brief check that the sequence has a minimum of amino acid checkSequenceDiversity
173 * @param sequence protein sequence
174 * @param window the size of substring to check
175 * @param minimum_aa_diversity minimum number of different amino acid in this window
176 */
177 static bool checkSequenceDiversity(const QString &sequence,
178 std::size_t window,
179 std::size_t minimum_aa_diversity);
180
181
182 /** @brief function made for testing the fastAlign process, process one line and return the
183 * alignment matrix
184 */
185 const std::vector<KeyCell> &oneAlignStep(const pappso::specpeptidoms::SpOMSProtein &sequence,
186 const std::size_t row_number,
187 const std::vector<AaPosition> &aa_positions,
188 const SpOMSSpectrum &spectrum,
189 const bool fast_align,
190 const pappso::specpeptidoms::SpOMSProtein *protein_ptr);
191
192 /** @brief function made for testing the fastAlign process, initiate the variables for alignment
193 */
194 void initFastAlign(const SpOMSSpectrum &spectrum);
195
196 /** @brief function made for testing the preciseAlign process, initiate the variables for
197 * alignment
198 */
199 void initpreciseAlign(const SpOMSSpectrum &spectrum, std::size_t length2);
200
201 private:
202 /**
203 * @brief Stores the best alignment from m_scenario in m_best_alignment
204 * @param sequence reversed sequence of the current alignment.
205 * @param spectrum Spectrum currently being aligned.
206 * @param offset Size of the protein sequence minus beginning of the alignment. Used to compute
207 * the position of the alignment in the protein sequence.
208 */
209 void saveBestAlignment(const SpOMSProtein &sequence,
210 const SpOMSSpectrum &spectrum,
211 std::size_t offset);
212
213 /**
214 * @brief Recursively performs the correction of the alignment.
215 * @param protein_seq Protein reversed sequence to align.
216 * @param protein_ptr Protein pointer on the sequence to align.
217 * @param spectrum Spectrum to align.
218 * @param peaks_to_remove Peaks to remove from the spectrum.
219 * @param offset Size of the protein sequence minus beginning of the alignment. Used to compute
220 * the position of the alignment in the protein sequence.
221 */
222 void correctAlign(const SpOMSProtein &protein_subseq,
223 const SpOMSProtein *protein_ptr,
224 const SpOMSSpectrum &spectrum,
225 std::vector<std::size_t> &peaks_to_remove,
226 std::size_t offset);
227
228 /**
229 * @brief updates the scores of the alignment matrix for a given amino acid as well as the
230 * location heap/scenario.
231 * @param sequence Reversed sequence of the protein being aligned
232 * /!\ In case of a subsequence alignment, the provided protein must only contain the sequence to
233 * align.
234 * @param row_number number of the row to update (== index in sequence of the amino acid being
235 * aligned)
236 * @param aa_positions list of the AaPositions of the current amino acid
237 * @param spectrum Spectrum being aligned
238 * @param fast_align Whether to use the fast version of the algorithm (for 1st alignemnt step)
239 * @param protein_ptr Protein pointer on the sequence to align.
240 * /!\ In case of a subsequence alignment, the provided protein must be the protein of origin
241 * (i.e. complete sequence).
242 */
244 const std::size_t row_number,
245 const std::vector<AaPosition> &aa_positions,
246 const SpOMSSpectrum &spectrum,
247 const bool fast_align,
248 const pappso::specpeptidoms::SpOMSProtein *protein_ptr);
249
250 /**
251 * @brief indicates if a perfect shift is possible between the provided positions
252 * @param sequence Reversed sequence of the protein being aligned
253 * @param spectrum Spectrum being aligned
254 * @param origin_row beginning row of the aa gap to verify (== index of the first missing aa in
255 * sequence)
256 * @param current_row row being processed (== index of the current AaPosition in sequence)
257 * @param l_peak left peak index of the mz gap to verify
258 * @param r_peak right peak index of the mz gap to verify
259 */
261 const SpOMSSpectrum &spectrum,
262 const std::size_t origin_row,
263 const std::size_t current_row,
264 const std::size_t l_peak,
265 const std::size_t r_peak) const;
266
267 /**
268 * @brief indicates if a perfect shift is possible from the spectrum beginning to the provided
269 * peak. Returns the perfect shift origin if the shift is possible, otherwise returns the current
270 * row.
271 * @param sequence Reversed sequence of the protein being aligned
272 * @param spectrum Spectrum being aligned
273 * @param current_row row being processed (== index of the current AaPosition in sequence)
274 * @param r_peak right peak index of the mz gap to verify
275 */
277 const SpOMSSpectrum &spectrum,
278 const std::size_t current_row,
279 const std::size_t r_peak) const;
280
281 /**
282 * @brief indicates if a perfect shift is possible between the provided positions
283 * @param sequence Reversed sequence of the protein being aligned
284 * @param spectrum Spectrum being aligned
285 * @param end_row Index of the last aligned row.
286 * @param end_peak Index of the last aligned peak.
287 */
289 const SpOMSSpectrum &spectrum,
290 std::size_t end_row,
291 std::size_t end_peak) const;
292
293
294 private:
295 std::vector<KeyCell> m_interest_cells;
296 std::vector<std::pair<std::size_t, KeyCell>> m_updated_cells;
298 const int min_score = 15;
306};
307} // namespace specpeptidoms
308} // namespace pappso
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
modelize peptide sequence to facilitate rendering in bracket or proforma
void initpreciseAlign(const SpOMSSpectrum &spectrum, std::size_t length2)
function made for testing the preciseAlign process, initiate the variables for alignment
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & oneAlignStep(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
function made for testing the fastAlign process, process one line and return the alignment matrix
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak....
void initFastAlign(const SpOMSSpectrum &spectrum)
function made for testing the fastAlign process, initiate the variables for alignment
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
SemiGlobalAlignment(const ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, const AaCode &aaCode)
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
#define PMSPP_LIB_DECL
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
const PrecisionBase * PrecisionPtr
Definition precision.h:122
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence