CandidateSearch 1.1.2
Proof-of-concept implementation of a search engine that uses sparse matrix multiplication to identify the best peptide candidates for a given mass spectrum.
Loading...
Searching...
No Matches
MGFParser.cs
Go to the documentation of this file.
1using System.Globalization;
2using System.Text.RegularExpressions;
3
5{
/// <summary>
/// A single centroided peak: a position, an intensity and a charge.
/// The MGF parser stores the parsed peak mass in <see cref="Position"/>.
/// </summary>
public class AMassCentroid
{
    /// <summary>Position of the peak.</summary>
    public double Position { get; set; }

    /// <summary>Intensity of the peak.</summary>
    public double Intensity { get; set; }

    /// <summary>Charge associated with the peak.</summary>
    public int Charge { get; set; }

    /// <summary>
    /// Creates a copy of this peak with the same position, intensity and charge.
    /// </summary>
    /// <returns>A new, independent <see cref="AMassCentroid"/>.</returns>
    public AMassCentroid Clone()
    {
        return Clone(Charge);
    }

    /// <summary>
    /// Creates a copy of this peak that keeps position and intensity but uses
    /// the given charge.
    /// </summary>
    /// <param name="charge">Charge to assign to the copy.</param>
    /// <returns>A new, independent <see cref="AMassCentroid"/>.</returns>
    public AMassCentroid Clone(int charge)
    {
        return new AMassCentroid
        {
            Position = Position,
            Intensity = Intensity,
            Charge = charge
        };
    }
}
22
/// <summary>
/// Precursor ion of a spectrum: m/z, charge and the uncharged mass derived from both.
/// </summary>
public class Precursor
{
    /// <summary>Mass-over-charge value; only changed through <see cref="SetMassCharge"/>.</summary>
    public double MOverZ { get; private set; }

    /// <summary>Uncharged mass computed by <see cref="SetMassCharge"/>.</summary>
    public double UnChargedMass { get; private set; }

    /// <summary>Precursor intensity.</summary>
    public double Intensity { get; set; }

    /// <summary>Charge state; only changed through <see cref="SetMassCharge"/>.</summary>
    public int Charge { get; private set; }

    /// <summary>Rank of the precursor.</summary>
    public int Rank { get; set; }

    /// <summary>
    /// Stores m/z and charge and recomputes <see cref="UnChargedMass"/> from them.
    /// </summary>
    /// <param name="mz">Mass-over-charge value.</param>
    /// <param name="charge">Charge state.</param>
    /// <param name="mono">Forwarded unchanged to <c>ChemicalUtils.CalculateUnchargedMass</c>.</param>
    public void SetMassCharge(double mz, int charge, bool mono)
    {
        this.MOverZ = mz;
        this.Charge = charge;

        // Same inputs as the stored properties; the calculation itself lives in ChemicalUtils.
        double uncharged = MSAMANDA_CHEMICALUTILS.ChemicalUtils.CalculateUnchargedMass(mz, charge, mono);
        this.UnChargedMass = uncharged;
    }
}
38
/// <summary>
/// One MS/MS spectrum: identifiers, retention time, fragment peaks and precursor.
/// </summary>
public class Spectrum
{
    /// <summary>Scan number of the spectrum; 0 when unknown.</summary>
    public int ScanNumber { get; set; }

    /// <summary>Identifier of the spectrum; the MGF parser assigns a running id.</summary>
    public int SpectrumId { get; set; }

    /// <summary>Retention time; the MGF parser fills it from the RTINSECONDS line.</summary>
    public double RT { get; set; }

    /// <summary>Fragment peaks of the spectrum. Starts empty instead of null.</summary>
    public List<AMassCentroid> FragmentsPeaks { get; set; } = new List<AMassCentroid>();

    // NOTE(review): the meaning of the "immune" collections is not visible in this file;
    // the MGF parser only creates them empty.

    /// <summary>Starts empty instead of null.</summary>
    public Dictionary<int, double> ImmunePeaks { get; set; } = new Dictionary<int, double>();

    /// <summary>Starts empty instead of null.</summary>
    public SortedSet<double> ImmuneMasses { get; set; } = new SortedSet<double>();

    /// <summary>Precursor of the spectrum; never null after construction.</summary>
    public Precursor Precursor { get; set; } = new Precursor();

    /// <summary>
    /// Creates a spectrum with zeroed identifiers and retention time, empty
    /// collections and a fresh <see cref="Precursor"/>.
    /// </summary>
    public Spectrum()
    {
        // Numeric members rely on their default value of 0; reference members are
        // set by the property initializers above so they can be used right away.
    }
}
57
58 public static class MGFParser
59 {
/// <summary>
/// Reads all spectra from an MGF file.
/// </summary>
/// <remarks>
/// A spectrum is the block between a <c>BEGIN IONS</c> and an <c>END IONS</c> line.
/// Inside a block the TITLE, PEPMASS, CHARGE, RTINSECONDS and SCANS lines are
/// evaluated; every other non-ignorable line is treated as a peak
/// (<c>mass intensity [third column]</c>, separated by spaces or tabs).
/// A spectrum that cannot be parsed is reported on the console and skipped; parsing
/// resumes at the next <c>BEGIN IONS</c>.
/// When no charge is given, one spectrum per candidate charge 2-6 is emitted; these
/// spectra share the same fragment peak list instance.
/// </remarks>
/// <param name="filename">Path of the MGF file to read.</param>
/// <returns>The spectra that were parsed successfully, in file order.</returns>
/// <exception cref="Exception">
/// Any exception raised while reading or parsing is logged to the console, the
/// partial result is cleared, and the exception is rethrown.
/// </exception>
public static List<Spectrum> ParseNextSpectra(string filename)
{
    // MGF keywords are ASCII, so they are matched ordinally and case-insensitively.
    // A culture-sensitive ToUpper() would turn "title" into "TİTLE" under e.g. tr-TR.
    bool HasKeyword(string text, string keyword)
    {
        return text.StartsWith(keyword, StringComparison.OrdinalIgnoreCase);
    }

    List<Spectrum> spectra = new List<Spectrum>();
    double mOverZ = 0.0;
    int charge = 0;
    // Highest intensity seen so far per mass index of the current spectrum.
    Dictionary<int, double> peaks = new Dictionary<int, double>();
    Spectrum currentSpectrum = null;
    // False while the rest of a broken spectrum is being skipped.
    bool isCorrect = true;
    string lastScannumber = string.Empty;
    string title = "";
    int _scanId = 0;

    try
    {
        using (StreamReader sr = new StreamReader(filename))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                bool isBeginIons = HasKeyword(line, "BEGIN IONS");

                // error in parsing spectrum, search for next one
                if (!isCorrect && !isBeginIons)
                {
                    continue;
                }

                if (isBeginIons)
                {
                    if (currentSpectrum != null && isCorrect)
                    {
                        Console.WriteLine(
                            " Skipping spectrum with scannumber '" + currentSpectrum.ScanNumber +
                            "'. No end ions found."
                        );
                    }

                    isCorrect = true;
                    peaks = new Dictionary<int, double>();
                    mOverZ = 0.0;
                    charge = 0;
                    // A rejected spectrum never reaches the reset at END IONS, so clear
                    // its title here; otherwise it would leak into this spectrum.
                    title = string.Empty;
                    currentSpectrum = new Spectrum
                    {
                        FragmentsPeaks = new List<AMassCentroid>(),
                        ImmuneMasses = new SortedSet<double>(),
                        ImmunePeaks = new Dictionary<int, double>(),
                        SpectrumId = _scanId
                    };
                    continue;
                }

                if (LineCanBeIgnored(line))
                {
                    continue;
                }

                if (currentSpectrum == null)
                {
                    isCorrect = false;
                    Console.WriteLine(
                        " Skipping spectrum after scannumber '" + lastScannumber + "'. No begin ions found."
                    );
                    continue;
                }

                if (HasKeyword(line, "TITLE"))
                {
                    title = line.Substring(line.IndexOf('=') + 1);

                    // Prefer a "scan" entry in the title; only look for "index" when the
                    // title does not mention "scan" at all.
                    string pattern = null;
                    if (line.Contains("SCAN", StringComparison.OrdinalIgnoreCase))
                    {
                        pattern = @"SCAN.?[:=\s]\s?([0-9]+)";
                    }
                    else if (line.Contains("INDEX", StringComparison.OrdinalIgnoreCase))
                    {
                        pattern = @"INDEX.?[:=\s]\s?([0-9]+)";
                    }

                    if (pattern != null)
                    {
                        Match match = Regex.Match(
                            line, pattern, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
                        // TryParse: an over-long digit run must not abort the whole file.
                        if (match.Success &&
                            int.TryParse(match.Groups[1].Value, NumberStyles.None,
                                CultureInfo.InvariantCulture, out int scanFromTitle))
                        {
                            currentSpectrum.ScanNumber = scanFromTitle;
                        }
                    }
                }
                else if (HasKeyword(line, "PEPMASS"))
                {
                    mOverZ = ParseMOverZ(line);
                }
                else if (HasKeyword(line, "CHARGE"))
                {
                    charge = ParseCharge(line);
                }
                else if (HasKeyword(line, "RTINSECONDS"))
                {
                    currentSpectrum.RT = ParseRt(line);
                }
                else if (HasKeyword(line, "SCANS"))
                {
                    currentSpectrum.ScanNumber = ParseScanNumber(line);
                }
                else if (HasKeyword(line, "END IONS"))
                {
                    if (currentSpectrum.ScanNumber == 0)
                    {
                        if (string.IsNullOrEmpty(title))
                        {
                            Console.WriteLine(
                                " Skipping spectrum after scannumber '" +
                                lastScannumber + "'. No title or scan number found."
                            );
                            isCorrect = false;
                            continue;
                        }

                        // Third-to-last dot-separated item of the title => first scan.
                        // A non-numeric item leaves the scan number at 0 instead of throwing.
                        string[] titleArr = title.Split('.');
                        if (titleArr.Length > 3 &&
                            int.TryParse(titleArr[titleArr.Length - 3], NumberStyles.Integer,
                                CultureInfo.InvariantCulture, out int firstScan))
                        {
                            currentSpectrum.ScanNumber = firstScan;
                        }
                    }

                    // ParseMOverZ reports failure with a negative sentinel, so every
                    // non-positive value counts as "no mass".
                    bool hasMass = mOverZ > 0;
                    if (currentSpectrum.FragmentsPeaks.Count == 0 || !hasMass)
                    {
                        string text = currentSpectrum.ScanNumber != 0
                            ? currentSpectrum.ScanNumber.ToString()
                            : title;

                        if (!hasMass)
                        {
                            Console.WriteLine(
                                " Skipping spectrum with scannumber '" + text + "'. No mass value found."
                            );
                        }
                        else
                        {
                            Console.WriteLine(
                                " Skipping spectrum with scannumber '" + text + "'. No peaks found."
                            );
                        }

                        isCorrect = false;
                        continue;
                    }

                    // masses and peaks not needed, new calculation from fragmentpeaks
                    // masses and peaks are overwritten in Spectrum.PrepareForSearch()

                    if (charge == 0)
                    {
                        // Unknown charge: emit one candidate spectrum per charge state.
                        foreach (int consideredCharge in new[] { 2, 3, 4, 5, 6 })
                        {
                            spectra.Add(GenerateSpectrum(currentSpectrum.FragmentsPeaks,
                                currentSpectrum.ScanNumber, _scanId, currentSpectrum.RT, mOverZ,
                                consideredCharge));
                            _scanId++;
                        }
                    }
                    else
                    {
                        currentSpectrum.Precursor.SetMassCharge(mOverZ, charge, true);
                        spectra.Add(currentSpectrum);
                        _scanId++;
                    }

                    lastScannumber = currentSpectrum.ScanNumber.ToString();
                    title = string.Empty;
                    peaks = new Dictionary<int, double>();
                    mOverZ = 0.0;
                    charge = 0;
                    currentSpectrum = null;
                }
                // fragments
                else
                {
                    string[] parts = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                    if (parts.Length == 1)
                    {
                        parts = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
                    }

                    if (parts.Length != 2 && parts.Length != 3)
                    {
                        Console.WriteLine(
                            " Skipping spectrum with scannumber '" +
                            currentSpectrum.ScanNumber + "'. Error in parsing peak.");
                        isCorrect = false;
                        continue;
                    }

                    double mass;
                    double intensity;

                    try
                    {
                        mass = ParseMass(parts[0], line);
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(
                            " Skipping spectrum with scannumber '" +
                            currentSpectrum.ScanNumber + "'. Error in parsing mass of peak.");
                        Console.WriteLine(e.ToString());
                        isCorrect = false;
                        continue;
                    }

                    try
                    {
                        intensity = ParseIntensity(parts[1], line);
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(
                            " Skipping spectrum with scannumber '" +
                            currentSpectrum.ScanNumber + "'. Error in parsing intensity of peak.");
                        Console.WriteLine(e.ToString());
                        isCorrect = false;
                        continue;
                    }

                    // The stored peak carries the highest intensity seen so far for its
                    // mass index, which may exceed the intensity on this line.
                    int key = MSAMANDA_CHEMICALUTILS.ChemicalUtils.GetMassIndex(mass);
                    if (!peaks.TryGetValue(key, out double binIntensity) || binIntensity < intensity)
                    {
                        binIntensity = intensity;
                        peaks[key] = intensity;
                    }

                    currentSpectrum.FragmentsPeaks.Add(
                        GenerateFragmentPeak(mass, binIntensity, charge));
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error parsing mgf file at or after spectrum '" + title + "'.");
        Console.WriteLine(ex.ToString());
        spectra.Clear();
        throw;
    }

    return spectra;
}
311
/// <summary>
/// Builds a spectrum for the given precursor charge around an existing peak list.
/// </summary>
/// <param name="fragmentsPeaks">Peak list to attach; the list instance is used as is, not copied.</param>
/// <param name="numb">Scan number of the spectrum.</param>
/// <param name="scanId">Spectrum id to assign.</param>
/// <param name="rt">Retention time.</param>
/// <param name="mOverZ">Precursor m/z.</param>
/// <param name="charge">Precursor charge.</param>
/// <returns>The newly created spectrum with its precursor mass and charge set.</returns>
private static Spectrum GenerateSpectrum(List<AMassCentroid> fragmentsPeaks, int numb, int scanId, double rt, double mOverZ, int charge)
{
    var spectrum = new Spectrum();

    spectrum.ScanNumber = numb;
    spectrum.SpectrumId = scanId;
    spectrum.RT = rt;
    spectrum.FragmentsPeaks = fragmentsPeaks;
    spectrum.ImmunePeaks = new Dictionary<int, double>();
    spectrum.ImmuneMasses = new SortedSet<double>();

    // The Spectrum constructor already created the precursor; only fill it in.
    spectrum.Precursor.SetMassCharge(mOverZ, charge, true);

    return spectrum;
}
326
/// <summary>
/// Creates a fragment peak from its position, intensity and charge.
/// </summary>
/// <param name="position">Position of the peak.</param>
/// <param name="intensity">Intensity of the peak.</param>
/// <param name="charge">Charge of the peak; stored unchanged.</param>
/// <returns>A new <see cref="AMassCentroid"/> holding the three values.</returns>
public static AMassCentroid GenerateFragmentPeak(double position, double intensity, int charge)
{
    // No narrowing cast: AMassCentroid.Charge is an int, and casting through short
    // would silently wrap values outside the 16-bit range.
    return new AMassCentroid
    {
        Position = position,
        Intensity = intensity,
        Charge = charge
    };
}
338
/// <summary>
/// Parses the intensity column of a peak line.
/// </summary>
/// <param name="replace">Intensity text; either ',' or '.' may be used as decimal separator.</param>
/// <param name="line">The complete peak line. Currently unused; kept for the existing call sites.</param>
/// <returns>The parsed intensity.</returns>
/// <exception cref="FormatException">The text is not a valid number.</exception>
private static double ParseIntensity(string replace, string line)
{
    // MGF is a machine-readable format: normalise the separator and parse with the
    // invariant culture so the result does not depend on the culture of the host.
    string normalized = replace.Replace(',', '.');
    return double.Parse(normalized, NumberStyles.Float, CultureInfo.InvariantCulture);
}
351
/// <summary>
/// Parses the mass column of a peak line.
/// </summary>
/// <param name="replace">Mass text; either ',' or '.' may be used as decimal separator.</param>
/// <param name="line">The complete peak line. Currently unused; kept for the existing call sites.</param>
/// <returns>The parsed mass.</returns>
/// <exception cref="FormatException">The text is not a valid number.</exception>
private static double ParseMass(string replace, string line)
{
    // MGF is a machine-readable format: normalise the separator and parse with the
    // invariant culture so the result does not depend on the culture of the host.
    string normalized = replace.Replace(',', '.');
    return double.Parse(normalized, NumberStyles.Float, CultureInfo.InvariantCulture);
}
364
/// <summary>
/// Parses the precursor charge from a <c>CHARGE</c> line such as <c>CHARGE=2+</c>.
/// </summary>
/// <remarks>
/// When several charges are listed (<c>CHARGE=2+ and 3+</c>, <c>CHARGE=2+,3+</c>),
/// only the first one is used.
/// </remarks>
/// <param name="line">The complete CHARGE line.</param>
/// <returns>The charge, or 0 when the line carries no value.</returns>
/// <exception cref="FormatException">The value is present but not an integer.</exception>
private static int ParseCharge(string line)
{
    // The value normally follows '='. Without one, fall back to the offset just
    // behind "CHARGE" plus one separator character, clamped so that a short line
    // yields "no value" instead of an out-of-range exception.
    int valueStart = line.IndexOf('=') + 1;
    if (valueStart == 0)
    {
        valueStart = Math.Min(7, line.Length);
    }

    string value = line.Substring(valueStart).Trim();

    // Keep only the first charge of a list.
    int tokenEnd = value.IndexOfAny(new[] { ' ', '\t', ',' });
    if (tokenEnd >= 0)
    {
        value = value.Substring(0, tokenEnd);
    }

    // MGF writes the polarity behind the number ("2+"); a leading sign ("+2") is
    // handled by the integer parser itself.
    if (value.EndsWith("+", StringComparison.Ordinal))
    {
        value = value.Substring(0, value.Length - 1);
    }

    if (value.Length == 0)
    {
        return 0;
    }

    return Int32.Parse(value, NumberStyles.Integer, CultureInfo.InvariantCulture);
}
378
/// <summary>
/// Parses the retention time from an <c>RTINSECONDS=value</c> line.
/// </summary>
/// <param name="line">The complete RTINSECONDS line; ',' or '.' may be used as decimal separator.</param>
/// <returns>The parsed retention time.</returns>
/// <exception cref="FormatException">The value is missing or not a valid number.</exception>
private static double ParseRt(string line)
{
    // Everything behind the first '=' is the value (the whole line when there is none).
    string value = line.Substring(line.IndexOf('=') + 1).Trim();

    // Normalise the separator and parse culture-independently.
    value = value.Replace(',', '.');

    if (!double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out double rt))
    {
        throw new FormatException("Error in parsing: " + line);
    }

    return rt;
}
396
/// <summary>
/// Parses the scan number from a <c>SCANS</c> line.
/// </summary>
/// <remarks>
/// Accepted forms are <c>SCANS=17</c>, a value prefixed with <c>MSMS:</c>, and a scan
/// range such as <c>SCANS=12-15</c>, of which the first scan is returned.
/// </remarks>
/// <param name="line">The complete SCANS line.</param>
/// <returns>The (first) scan number.</returns>
/// <exception cref="FormatException">The value is missing or not an integer.</exception>
private static int ParseScanNumber(string line)
{
    const string marker = "MSMS:";

    string value = line.Substring(line.IndexOf('=') + 1);

    int markerIndex = value.IndexOf(marker, StringComparison.Ordinal);
    if (markerIndex >= 0)
    {
        value = value.Substring(markerIndex + marker.Length);
    }

    value = value.Trim();

    // A range is written "first-last". Search from index 1 so that a leading sign
    // is not mistaken for the range separator.
    int dash = value.Length > 1 ? value.IndexOf('-', 1) : -1;
    if (dash > 0)
    {
        value = value.Substring(0, dash);
    }

    return Int32.Parse(value, NumberStyles.Integer, CultureInfo.InvariantCulture);
}
409
/// <summary>
/// Parses the precursor m/z from a <c>PEPMASS</c> line.
/// </summary>
/// <remarks>
/// Only the first value behind the keyword is used; anything after the first space
/// or tab (for example an intensity) is ignored.
/// </remarks>
/// <param name="line">The complete PEPMASS line; ',' or '.' may be used as decimal separator.</param>
/// <returns>The parsed m/z, or -1 when the value is missing or not a number.</returns>
private static double ParseMOverZ(string line)
{
    // The value normally follows '='. Without one, fall back to the offset just
    // behind "PEPMASS" plus one separator character, clamped to the line length.
    int valueStart = line.IndexOf('=') + 1;
    if (valueStart == 0)
    {
        valueStart = Math.Min(8, line.Length);
    }

    string rest = line.Substring(valueStart).TrimStart();

    // The first whitespace of either kind ends the m/z value.
    int tokenEnd = rest.IndexOfAny(new[] { ' ', '\t' });
    string token = tokenEnd == -1 ? rest : rest.Substring(0, tokenEnd);

    if (token.Length == 0)
    {
        return -1;
    }

    // Normalise the separator and parse culture-independently.
    token = token.Replace(',', '.');
    if (double.TryParse(token, NumberStyles.Float, CultureInfo.InvariantCulture, out double mOverZ))
    {
        return mOverZ;
    }

    Console.WriteLine("Error in parsing: " + line);
    return -1;
}
446
/// <summary>
/// Line prefixes that carry no information for the parser, compared
/// case-insensitively against the line without its leading whitespace.
/// </summary>
private static readonly string[] IgnoredLinePrefixes =
{
    // comment lines
    "#", "!", ";", "/", "_",
    // unused info in lines
    "MASS", "INSTRUMENT",
    // ignored values from http://www.matrixscience.com/help/data_file_help.html#RULES
    // ("COM" also covers COMP, "ITOL" also covers ITOLU)
    "ACCESSION", "CLE", "COM", "CUTOUT", "DB", "DECOY", "ERRORTOLERANT", "ETAG",
    "FORMAT", "FRAMES", "IT_MODS", "ITOL", "LIBRARY_SEARCH", "LOCUS", "MODS",
    "MULTI_SITE_MODS", "PEP_ISOTOPE_ERROR", "PFA",
    // only used in .pks, .xml
    "PRECURSOR",
    "QUANTIFICATION", "RAWFILE", "RAWSCANS", "REPORT", "REPTYPE", "SEARCH", "SEG",
    "SEQ", "TAG", "TAXONOMY", "TOL",
    // combines USER00 - USER12, USEREMAIL, USERNAME
    "USER"
};

/// <summary>
/// Tells whether a line of an MGF file can be skipped by the parser.
/// </summary>
/// <remarks>
/// Blank lines, comment lines and lines starting with one of the
/// <see cref="IgnoredLinePrefixes"/> are skipped. Lines that are NOT ignored include
/// BEGIN IONS, END IONS, TITLE, SCANS, RTINSECONDS, PEPMASS, CHARGE and peak lines.
/// </remarks>
/// <param name="line">The line to inspect; may be null.</param>
/// <returns><c>true</c> when the line can be skipped.</returns>
private static bool LineCanBeIgnored(string line)
{
    if (string.IsNullOrWhiteSpace(line))
    {
        return true;
    }

    string text = line.TrimStart();

    // Ordinal, case-insensitive matching keeps the result independent of the current
    // culture (a culture-sensitive ToUpper() breaks on 'i' under e.g. tr-TR).
    foreach (string prefix in IgnoredLinePrefixes)
    {
        if (text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
543 }
544}
AMassCentroid Clone(int charge)
Definition MGFParser.cs:17
void SetMassCharge(double mz, int charge, bool mono)
Definition MGFParser.cs:31
List< AMassCentroid > FragmentsPeaks
Definition MGFParser.cs:44
Dictionary< int, double > ImmunePeaks
Definition MGFParser.cs:45
SortedSet< double > ImmuneMasses
Definition MGFParser.cs:46