libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
localsignaltonoiseestimator.cpp
Go to the documentation of this file.
1// Copyright 2026 Filippo Rusconi
2// Inspired by work by Lars Nilse in OpenMS
3
4
5/////////////////////// stdlib includes
6
7
8/////////////////////// Qt includes
9#include <QList>
10#include <QDebug>
11
12
13/////////////////////// pappsomspp includes
14
15
16/////////////////////// Local includes
18
19namespace pappso
20{
25
29
30// Calculate mean & stdev of intensities of a Trace
33 const TraceIterator &iter_first_data_point,
34 const TraceIterator &iter_last_data_point) const
35{
36 int size = 0;
37 // add up
38 double v = 0;
39 double m = 0;
40 TraceIterator run = iter_first_data_point;
41 while(run != iter_last_data_point)
42 {
43 m += (*run).y;
44 ++size;
45 ++run;
46 }
47 // average
48 m = m / size;
49
50 // determine variance
51 run = iter_first_data_point;
52 while(run != iter_last_data_point)
53 {
54 double tmp(m - (*run).y);
55 v += tmp * tmp;
56 ++run;
57 }
58 v = v / ((double)size); // divide by n
59
60 GaussianEstimateParams estimate_params = {m, v};
61 return estimate_params;
62}
63
64void
69
70// Calculate signal-to-noise values for all data points given, by using a
71// sliding window approach
72void
74{
75 // First element in the Trace
76 TraceIterator iter_first_data_point = trace.begin();
77 // Last element in the Trace
78 TraceIterator iter_last_data_point = trace.end();
79
80 // Reset counter for sparse windows
82 // Reset counter for histogram overflow
84
85 // Reset the results
87 m_signalToNoiseEstimates.resize(trace.size());
88
89 // Maximal range of histogram needs to be calculated first
90 if(m_parameters.maxIntMode == AUTO_MAX_BY_STDEV)
91 {
92 // Use MEAN+auto_m_parameters.maxIntensity*STDEV as threshold
93 GaussianEstimateParams gaussian_estimate_global =
94 estimateGaussian(iter_first_data_point, iter_last_data_point);
95 m_parameters.maxIntensity =
96 gaussian_estimate_global.mean +
97 std::sqrt(gaussian_estimate_global.variance) * m_parameters.maxIntStdDevFactor;
98
99 // qDebug() << "Computed max intensity:" << m_parameters.maxIntensity;
100 }
101 else if(m_parameters.maxIntMode == AUTO_MAX_BY_PERCENTILE)
102 {
103 // get value at "m_parameters.maxIntPercentile"th percentile
104 // we use a histogram approach here as well.
105 if((m_parameters.maxIntPercentile < 0) || (m_parameters.maxIntPercentile > 100))
106 {
107 qFatal() << "The m_parameters.maxIntPercentile value is not in the [0, 100] range:"
108 << m_parameters.maxIntPercentile;
109 }
110
111 // Histogram with 100 percent values all initialized to 0.
112 QList<int> histogram_of_intensity_distribution(100, 0);
113
114 // Find maximum of current scan
115 auto max_intensity_data_point_iterator = std::max_element(
116 trace.begin(), trace.end(), [](const DataPoint &a, const DataPoint &b) {
117 return a.y > b.y;
118 });
119
120 double max_intensity = max_intensity_data_point_iterator->y;
121
122 // qDebug() << "Max intensity value in the trace:" << max_intensity;
123
124 double histogram_bin_size = max_intensity / 100;
125
126 // Fill histogram
127 for(const DataPoint &data_point : trace)
128 {
129 // Increment count at histogram bin that is the bin corresponding
130 // to the iterated data point intensity in the percentage scale above.
131 int bin = (int)((data_point.y - 1) / histogram_bin_size);
132 ++histogram_of_intensity_distribution[bin];
133 }
134
135 // Add up element counts in histogram until ?th percentile is reached
136 // For example if percentile is 95 percent, then the value elements_below_max_int_percentile
137 // below is the number of data points that correspond to 95 % of the whole number of data
138 // points in the spectrum.
139 int elements_below_max_int_percentile =
140 (int)(m_parameters.maxIntPercentile * trace.size() / 100);
141
142 int elements_seen = 0;
143 int i = -1;
144
145 TraceIterator the_iterator = iter_first_data_point;
146
147 // This loop is equivalent to iterating in the bins of the histogram,
148 // and for each bin sum the corresponding count to elements_seen.
149 // When elements_seen reaches 95% of the number of DataPoint instances
150 // in the mass spectrum, stop and record the bin number (i).
151 while(the_iterator != iter_last_data_point &&
152 elements_seen < elements_below_max_int_percentile)
153 {
154 ++i;
155 elements_seen += histogram_of_intensity_distribution[i];
156 ++the_iterator;
157 }
158
159 // Because each histogram_bin_size accounts for a hundredth of the max intensity of the
160 // whole mass spectra data, returning the i * histogram_bin_size is equivalent to
161 // returning the greatest intensity value encountered in the 95% of the mass spectrum.
162 // Half a bin is added to center the returned value in the bin.
163 m_parameters.maxIntensity = (((double)i) + 0.5) * histogram_bin_size;
164
165 // qDebug() << "Computed max intensity:" << m_parameters.maxIntensity;
166 }
167 else // if (m_parameters.maxIntMode == MANUAL_BY_VALUE)
168 {
169 if(m_parameters.maxIntensity <= 0)
170 {
171 qFatal() << "The method to determine the maximal intensity is set to MANUAL"
172 << "However, the m_parameters.maxIntensity value is <= 0.";
173 }
174
175 // qDebug() << "The max intensity was set manually:" << m_parameters.maxIntensity;
176 }
177
178 if(m_parameters.maxIntensity < 0)
179 {
180 qFatal() << "The m_parameters.maxIntensity "
181 "value should be positive! ";
182 }
183
184 TraceIterator window_pos_center = iter_first_data_point;
185 TraceIterator window_pos_border_left = iter_first_data_point;
186 TraceIterator window_pos_border_right = iter_first_data_point;
187
188 double window_half_size = m_parameters.windowSize / 2;
189 // At least size of 1 for intensity bins
190 double bin_size = std::max(1.0, m_parameters.maxIntensity / m_parameters.binCount);
191 int bin_count_minus_one = m_parameters.binCount - 1;
192
193 // Allocate the histogram with the user-configured number of bins
194 std::vector<int> histogram(m_parameters.binCount, 0);
195
196 // This vector will store for each bin, the value in it.
197 std::vector<double> bin_values(m_parameters.binCount, 0);
198
199 // Calculate average intensity that is represented by a bin
200 for(int bin = 0; bin < m_parameters.binCount; bin++)
201 {
202 histogram[bin] = 0;
203 bin_values[bin] = (bin + 0.5) * bin_size;
204 }
205
206 // This will store the bin in which a datapoint would fall.
207 int to_bin = 0;
208
209 // index of bin where the median is located
210 int median_bin = 0;
211 // additive number of elements from left to x in histogram
212 int element_inc_count = 0;
213
214 // tracks elements in current window, which may vary because of unevenly
215 // spaced data
216 int elements_in_window = 0;
217 // number of windows
218 int window_count = 0;
219
220 // number of elements where we find the median
221 int element_in_window_half = 0;
222
223 double noise; // noise value of a datapoint
224
225 // Iterate in the mass spectrum and at each data point, determine the signal-to-noise ratio
226 // and store it. The signal-to-noise ratio is computed over a rolling window of
227 // a configured size that is centered on the iterated data point.
228 // At each data point in the mass spectrum (that is, at
229 // iterator window_pos_center), update the histogram data for the rolling window.
230 // The data in the histogram will serve as the basis to determine the noise.
231
232 while(window_pos_center != iter_last_data_point)
233 {
234 // Each time we iterate in a new mass spectrum DataPoint, that is, each
235 // time we increment window_pos_center, we need to shift the rolling window
236 // to center it at the new mass spectrum data point.
237
238 // Erase all elements from histogram that will leave the rolling window on the
239 // LEFT side
240 while((*window_pos_border_left).x <
241 (*window_pos_center).x - window_half_size)
242 {
243 to_bin = std::max(
244 std::min<int>((int)((*window_pos_border_left).y / bin_size),
245 bin_count_minus_one),
246 0);
247
248 // Remove the current window's left half's DataPoint from
249 // the histogram counts at bin corresponding to its intensity.
250 --histogram[to_bin];
251 // Remove the current window's left half's DataPoint from
252 // the total count of DataPoint instances in the rolling window.
253 --elements_in_window;
254 // Now go on and shift the rolling window's left border iterator one position
255 // to the right.
256 ++window_pos_border_left;
257
258 // Do this until the iterator of the left border of the rolling window points
259 // to a mass spectrum DataPoint that has its m/z value matching the
260 // current rolling window center iterator m/z value minus half of the
261 // rolling window width.
262 }
263
264 // Add all elements to histogram that will enter the rolling window on the RIGHT
265 // side
266 while((window_pos_border_right != iter_last_data_point) &&
267 ((*window_pos_border_right).x <=
268 (*window_pos_center).x + window_half_size))
269 {
270 // std::cerr << (*window_pos_border_right).y << " " <<
271 // bin_size << " " << m_parameters.binCountminus_1 << std::endl;
272 to_bin = std::max(
273 std::min<int>((int)((*window_pos_border_right).y / bin_size),
274 bin_count_minus_one),
275 0);
276 ++histogram[to_bin];
277 ++elements_in_window;
278 ++window_pos_border_right;
279 }
280
281 if(elements_in_window < m_parameters.minRequiredElements)
282 {
283 noise = m_parameters.noiseValueForEmptyWindow;
285 }
286 else
287 {
288 // Find bin i where ceil[elements_in_window/2] <= sum_c(0..i){
289 // histogram[c] }
290 median_bin = -1;
291 element_inc_count = 0;
292 element_in_window_half = (elements_in_window + 1) / 2;
293 while(median_bin < bin_count_minus_one &&
294 element_inc_count < element_in_window_half)
295 {
296 ++median_bin;
297 element_inc_count += histogram[median_bin];
298 }
299
300 // Increase the error count
301 if(median_bin == bin_count_minus_one)
302 {
304 }
305
306 // Just avoid division by 0
307 noise = std::max(1.0, bin_values[median_bin]);
308 }
309
310 // Store result
311 m_signalToNoiseEstimates[window_count] = (*window_pos_center).y / noise;
312 // qDebug() << "Added new signal to noise estimate:"
313 // << m_signalToNoiseEstimates.at(window_count);
314
315 // Advance the window center by one datapoint
316 ++window_pos_center;
317 ++window_count;
318
319 } // end while
320
321 // qDebug() << "Done filling in the signal to noise estimates, the are"
322 // << m_signalToNoiseEstimates.size() << "estimates.";
323
326 m_histogramOverflowPercentage * 100 / window_count;
327
328 // warn if percentage of sparse windows is above 20%
330 {
331 qWarning() << m_sparseWindowPercentage
332 << "% of all windows were sparse. You should consider increasing"
333 << "'m_parameters.windowSize' or decrease 'm_parameters.minRequiredElements'";
334 }
335
336 // warn if percentage of possibly wrong median estimates is above 1%
338 {
340 << "% of all Signal-to-Noise estimates are too high, because the "
341 "median was found in the rightmost histogram-bin. "
342 << "You should consider increasing 'm_parameters.maxIntensityy' (and maybe "
343 "'m_parameters.binCount' with it, to keep bin width reasonable)";
344 }
345}
346
347// Return to signal/noise estimate for date point @p index
348// @note you will get a warning to stderr if more than 20% of the
349// noise estimates used sparse windows
350double
352{
353 Q_ASSERT(index < m_signalToNoiseEstimates.size());
354 return m_signalToNoiseEstimates[index];
355}
356
357} // namespace pappso
LocalSignalToNoiseEstimator(const Parameters &parameters)
GaussianEstimateParams estimateGaussian(const TraceIterator &iter_first_data_point, const TraceIterator &iter_last_data_point) const
double getSignalToNoiseRatio(qsizetype index) const
A simple container of DataPoint instances.
Definition trace.h:152
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition aa.cpp:39
std::vector< DataPoint >::const_iterator TraceIterator