CandidateSearch 1.1.2
Proof-of-concept implementation of a search engine that uses sparse matrix multiplication to identify the best peptide candidates for a given mass spectrum.
Loading...
Searching...
No Matches
FASTAParser.cs
Go to the documentation of this file.
1using System.Collections.Concurrent;
2using System.Text.RegularExpressions;
3using System.Text;
4using MessagePack;
7
9{
10 public class FASTAParser
11 {
/// <summary>
/// Reads a FASTA file, digests every protein with fully specific trypsin and returns the
/// peptidoforms produced by the digestion.
/// </summary>
/// <param name="fastaFileName">Path of the FASTA file to read.</param>
/// <param name="settings">Settings forwarded to the digestion and peptidoform generation.</param>
/// <param name="generateDecoys">When true, each protein sequence is also reversed and digested as a decoy.</param>
/// <param name="coreUsage">Fraction of the logical processors to use for the parallel digestion.</param>
/// <returns>The target peptides, followed by the decoy peptides when decoys were requested.</returns>
public static List<Peptide> DigestFasta(string fastaFileName,
                                        Settings settings,
                                        bool generateDecoys = false,
                                        double coreUsage = 0.75)
{
    var protease = new Enzyme
    {
        Name = "Trypsin",
        CleavageSites = "KR",
        CleavageInhibitors = "P",
        Specificity = Enzyme.CLEAVAGE_SPECIFICITY.FULL,
        Offset = 1
    };

    List<DBProtein> targetProteins = ReadInFasta(fastaFileName, false);
    List<Peptide> result = DigestProteins(targetProteins, protease, settings, false, coreUsage);

    if (!generateDecoys)
    {
        return result;
    }

    // A decoy keeps the target's identifier, negates its mapping id and reverses its sequence.
    List<DBProtein> reversedProteins = targetProteins
        .Select(target => new DBProtein(target.DbProtRef.ProtIdentifier,
                                        -target.DbProtRef.MappingId,
                                        ReverseSequence(target.Sequence),
                                        true))
        .ToList();

    result.AddRange(DigestProteins(reversedProteins, protease, settings, true, coreUsage));
    return result;
}
53
/// <summary>Returns <paramref name="seq"/> with its characters in reverse order.</summary>
private static string ReverseSequence(string seq)
{
    var reversed = new char[seq.Length];
    for (int source = 0, target = seq.Length - 1; source < seq.Length; source++, target--)
    {
        reversed[target] = seq[source];
    }
    return new string(reversed);
}
60
/// <summary>
/// Digests all proteins in parallel, merges identical peptides per integer mass and expands
/// every merged peptide into its peptidoforms.
/// </summary>
/// <param name="proteins">Proteins to digest.</param>
/// <param name="enzyme">Enzyme handed to each <see cref="ProteinDigester"/>.</param>
/// <param name="settings">Supplies the missed-cleavage and peptide-length limits and the modification tables.</param>
/// <param name="isDecoy">Decoy flag passed on to every created <see cref="Peptide"/>.</param>
/// <param name="coreUsage">Fraction of the logical processors to use; the worker count never drops below one.</param>
/// <returns>All peptidoforms of all digested proteins.</returns>
private static List<Peptide> DigestProteins(List<DBProtein> proteins,
                                            Enzyme enzyme,
                                            Settings settings,
                                            bool isDecoy,
                                            double coreUsage)
{
    // MaxDegreeOfParallelism must be -1 or positive; a zero/negative/NaN coreUsage would otherwise
    // produce 0 (or less) and make ParallelOptions throw, so clamp to a single worker.
    int workerCount = Math.Max(1, (int) Math.Ceiling(Environment.ProcessorCount * coreUsage));
    var opts = new ParallelOptions { MaxDegreeOfParallelism = workerCount };

    var concurrentPeptideList = new ConcurrentBag<List<DBPeptide>>();
    Parallel.ForEach(proteins, opts, (protein) =>
    {
        var digester = new ProteinDigester(enzyme, settings.MAX_CLEAVAGES, true, settings.MIN_PEP_LENGTH, settings.MAX_PEP_LENGTH, protein);
        concurrentPeptideList.Add(digester.DigestProteinIntoList());
    });

    var peptideList = concurrentPeptideList.SelectMany(x => x).ToList();
    var massToPeptides = new DigesterDB();
    HelperMethods.MergeToDBDictionaries(peptideList, ref massToPeptides, opts);

    var peptides = new List<Peptide>();
    foreach (var item in massToPeptides.DbPeptidesDictMassKey)
    {
        foreach (var peptide in item.Value)
        {
            peptides.AddRange(GetPeptidoforms(peptide, settings, isDecoy));
        }
    }

    return peptides;
}
91
/// <summary>
/// Builds the peptidoforms of one digested peptide: the peptide carrying all applicable fixed
/// modifications, plus whatever each variable modification adds to the list.
/// </summary>
/// <param name="dbPeptide">Digested peptide providing sequence and mass.</param>
/// <param name="settings">Supplies the fixed and variable modification tables.</param>
/// <param name="isDecoy">Decoy flag passed to the created <see cref="Peptide"/>.</param>
/// <returns>The list of peptidoforms; its first entry is the peptide with only fixed modifications.</returns>
private static List<Peptide> GetPeptidoforms(DBPeptide dbPeptide, Settings settings, bool isDecoy)
{
    var fixedMods = new Dictionary<int, double>();

    if (settings.FIXED_MODIFICATIONS.Count > 0)
    {
        string residues = dbPeptide.Sequence;
        for (int position = 0; position < residues.Length; position++)
        {
            // The table is keyed by one-character strings, hence the ToString().
            if (settings.FIXED_MODIFICATIONS.TryGetValue(residues[position].ToString(), out double massShift))
            {
                fixedMods.Add(position, massShift);
            }
        }
    }

    var peptidoforms = new List<Peptide>
    {
        new Peptide(dbPeptide.Sequence, dbPeptide.Mass, fixedMods, settings, isDecoy)
    };

    foreach (var variableMod in settings.VARIABLE_MODIFICATIONS)
    {
        addPeptidoformsForModification(peptidoforms, variableMod, settings);
    }

    return peptidoforms;
}
121
/// <summary>
/// Extends <paramref name="peptides"/> in place with the peptidoforms that carry the given
/// variable modification on at least one matching residue.
/// </summary>
/// <param name="peptides">Peptidoforms generated so far; new peptidoforms are appended to this list.</param>
/// <param name="modification">Residue (key) and modification mass (value) of the variable modification.</param>
/// <param name="settings">Settings passed to every created <see cref="Peptide"/>.</param>
private static void addPeptidoformsForModification(List<Peptide> peptides,
                                                   KeyValuePair<string, double> modification,
                                                   Settings settings)
{
    var peptidoforms = new List<Peptide>();

    foreach (var peptide in peptides)
    {
        var possibleModificationSites = new List<int>();
        for (int i = 0; i < peptide.sequence.Length; i++)
        {
            if (peptide.sequence[i].ToString() == modification.Key)
            {
                possibleModificationSites.Add(i);
            }
        }

        foreach (var combination in getAllPossibleCombinations(possibleModificationSites))
        {
            // The empty subset would only reproduce `peptide`, which is already in the list.
            // Emitting it again duplicated every peptidoform, and the duplicates multiplied
            // with each further variable modification.
            if (combination.Count == 0)
            {
                continue;
            }

            var peptidoform = new Peptide(peptide.sequence,
                                          peptide.mass,
                                          new Dictionary<int, double>(),
                                          settings,
                                          peptide.isDecoy);

            // Carry over the modifications the source peptidoform already has ...
            foreach (var mod in peptide.modifications)
            {
                peptidoform.addModification(mod.Key, mod.Value);
            }

            // ... and add the variable modification at every site of this subset.
            foreach (var position in combination)
            {
                peptidoform.addModification(position, modification.Value);
            }

            peptidoforms.Add(peptidoform);
        }
    }

    // Appended after the loop so the list is not modified while it is being enumerated.
    peptides.AddRange(peptidoforms);
}
166
/// <summary>
/// Enumerates every subset of the given sites, the empty subset included.
/// </summary>
/// <remarks>
/// Subsets are produced in ascending bit-mask order; the subset count is computed as a 32-bit
/// shift, so the enumeration is only meaningful for fewer than 31 sites.
/// </remarks>
/// <param name="possibleModificationSites">Candidate positions.</param>
/// <returns>One list per subset; each list keeps the order of the input.</returns>
private static List<List<int>> getAllPossibleCombinations(List<int> possibleModificationSites)
{
    var subsets = new List<List<int>>();

    int mask = 0;
    while (mask < (1 << possibleModificationSites.Count))
    {
        var subset = new List<int>();
        int bit = 0;
        foreach (int site in possibleModificationSites)
        {
            // Bit `bit` of the mask decides whether the site at that index is part of the subset.
            if (((mask >> bit) & 1) == 1)
            {
                subset.Add(site);
            }
            bit++;
        }
        subsets.Add(subset);
        ++mask;
    }

    return subsets;
}
186
/// <summary>
/// Parses a FASTA file into a list of proteins.
/// </summary>
/// <remarks>
/// Sequence lines are upper-cased (culture-invariant) and concatenated per record. A record whose
/// sequence contains a character outside <c>ARNDCEQGHILKMFPSTUWYVXBZJO</c> aborts parsing.
/// Records with an empty or whitespace-only sequence are skipped and do not consume a mapping id.
/// </remarks>
/// <param name="fastaFileName">Path of the FASTA file.</param>
/// <param name="isDecoy">Forwarded to <see cref="GenerateDbProtein"/> for every record.</param>
/// <returns>The parsed proteins in file order, numbered with consecutive mapping ids from 0.</returns>
/// <exception cref="InvalidDataException">A sequence contains an unsupported character.</exception>
private static List<DBProtein> ReadInFasta(string fastaFileName, bool isDecoy)
{
    const string regexPatternSequence = "[^ARNDCEQGHILKMFPSTUWYVXBZJO]";

    var proteins = new List<DBProtein>();
    var sequence = new StringBuilder();
    string identifier = "";
    int mappingID = 0;

    // Validates and stores the record collected so far, then resets the sequence buffer.
    void FinishRecord()
    {
        string residues = sequence.ToString();
        sequence.Clear();

        if (string.IsNullOrWhiteSpace(residues))
        {
            return;
        }

        if (Regex.IsMatch(residues, regexPatternSequence))
        {
            Console.WriteLine("Cannot parse fasta file at identifier " + identifier + ". Sequence error. ");
            throw new InvalidDataException("Parsing error.");
        }

        proteins.Add(GenerateDbProtein(mappingID, isDecoy, identifier, residues));
        mappingID++;
    }

    // The reader is opened outside the try block, so failures to open the file propagate
    // without the "Fasta file error" console output, exactly as before.
    using (var fastaFileReader = new StreamReader(fastaFileName))
    {
        try
        {
            string currentLine;
            while ((currentLine = fastaFileReader.ReadLine()) != null)
            {
                if (currentLine.StartsWith(">", StringComparison.Ordinal))
                {
                    // A header closes the previous record and starts a new one.
                    FinishRecord();
                    identifier = ParseHeaderIdentifier(currentLine);
                }
                else
                {
                    sequence.Append(currentLine.ToUpperInvariant());
                }
            }

            // The last record has no following header to close it.
            FinishRecord();
        }
        catch (Exception e)
        {
            Console.WriteLine("Fasta file error");
            Console.WriteLine(e.ToString());
            throw;
        }
    }

    return proteins;
}

/// <summary>
/// Extracts the protein identifier from a FASTA header line: the text after the leading
/// character up to the first ' ' or '|' separator at which the collected text contains a digit,
/// or the whole remainder of the line if no such separator exists.
/// </summary>
private static string ParseHeaderIdentifier(string headerLine)
{
    char[] separators = { ' ', '|' };
    char[] digits = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };

    int index = headerLine.IndexOfAny(separators, 0);
    while (index != -1)
    {
        string candidate = headerLine.Substring(1, index - 1);
        if (candidate.IndexOfAny(digits) != -1)
        {
            return candidate;
        }
        index = headerLine.IndexOfAny(separators, index + 1);
    }

    return headerLine.Substring(1);
}
283
/// <summary>
/// Creates the <see cref="DBProtein"/> for one FASTA record.
/// </summary>
/// <param name="mappingID">Running number of the record; negated for decoys.</param>
/// <param name="isDecoy">Whether the protein is a decoy.</param>
/// <param name="identifier">Identifier parsed from the FASTA header.</param>
/// <param name="sequence">Amino acid sequence of the record.</param>
/// <returns>The protein; decoys carry a negative mapping id and have the decoy flag set.</returns>
private static DBProtein GenerateDbProtein(int mappingID, bool isDecoy, string identifier, string sequence)
{
    // Decoys are marked by a negative id; the flag is passed as well so that IsDecoy agrees
    // with the decoy proteins that DigestFasta constructs directly.
    int id = isDecoy ? -mappingID : mappingID;
    return new DBProtein(identifier, id, sequence, isDecoy);
}
294 }
295
/// <summary>
/// A protein read from the database: its sequence, a decoy flag and its reference ids.
/// </summary>
public class DBProtein
{
    /// <summary>Amino acid sequence of the protein.</summary>
    public string Sequence { get; set; }

    /// <summary>Decoy flag as passed to the constructor.</summary>
    public bool IsDecoy { get; set; }

    /// <summary>Identifier and ids under which peptides refer back to this protein.</summary>
    public DBProtRef DbProtRef { get; set; }

    /// <summary>
    /// Creates a protein. If <paramref name="identifier"/> parses as an integer, that number
    /// becomes the protein id; otherwise <paramref name="id"/> is used for it.
    /// </summary>
    /// <param name="identifier">Textual identifier of the protein.</param>
    /// <param name="id">Mapping id; also the fallback protein id.</param>
    /// <param name="sequence">Amino acid sequence.</param>
    /// <param name="isDecoy">Decoy flag stored in <see cref="IsDecoy"/>.</param>
    public DBProtein(string identifier, int id, string sequence, bool isDecoy = false)
    {
        bool identifierIsNumeric = int.TryParse(identifier, out int numericIdentifier);

        DbProtRef = new DBProtRef
        {
            ProtId = identifierIsNumeric ? numericIdentifier : id,
            MappingId = id,
            ProtIdentifier = identifier
        };
        IsDecoy = isDecoy;
        Sequence = sequence;
    }
}
327
/// <summary>
/// Serializable reference to a protein: numeric protein id, mapping id and textual identifier.
/// </summary>
[MessagePackObject]
public struct DBProtRef
{
    /// <summary>Numeric protein id.</summary>
    [Key(0)] public int ProtId { get; set; }

    /// <summary>Mapping id of the protein.</summary>
    [Key(1)] public int MappingId { get; set; }

    /// <summary>Textual identifier of the protein.</summary>
    [Key(2)] public string ProtIdentifier { get; set; }
}
339
340 public class Enzyme
341 {
/// <summary>
/// Creates an enzyme with empty cleavage sites, offset 0 and an empty name.
/// </summary>
// NOTE(review): source lines 344, 347, 351 and 355 are absent from this listing; other members
// (e.g. the Specificity property used elsewhere in this file) are presumably declared there - confirm.
342 public Enzyme()
343 {
345 CleavageSites = "";
346 Offset = 0;
348 Name = "";
349 }
350

352
/// <summary>Residues after/before which the enzyme cleaves; inserted verbatim into a regex character class.</summary>
353 public string CleavageSites { get; set; }
/// <summary>Residues that prevent cleavage when adjacent to a site; inserted verbatim into a negated character class.</summary>
354 public string CleavageInhibitors { get; set; }
/// <summary>1 selects the C-terminal cleavage pattern; any other value selects the N-terminal pattern.</summary>
356 public int Offset { get; set; }
/// <summary>Display name of the enzyme.</summary>
357 public string Name { get; set; }
358
/// <summary>
/// Zero-width regular expression that matches at every cleavage position of this enzyme.
/// </summary>
/// <remarks>
/// The expression is cached and only rebuilt when the pattern derived from
/// <see cref="CleavageSites"/>, <see cref="CleavageInhibitors"/> and <see cref="Offset"/> changes.
/// Previously a new Lazy wrapper - and therefore a new Regex - was created on every access.
/// Concurrent readers may at worst build the same expression twice; the reference write is atomic.
/// </remarks>
public Regex TheRegex
{
    get
    {
        string pattern = BuildCleavagePattern();
        Regex cached = _cachedRegex;

        // Regex.ToString() returns the pattern the instance was constructed with.
        if (cached == null || cached.ToString() != pattern)
        {
            cached = new Regex(pattern);
            _cachedRegex = cached;
        }
        return cached;
    }
}

// Most recently built cleavage expression; null until TheRegex is first read.
private Regex _cachedRegex;

/// <summary>
/// Builds the cleavage pattern for the current property values.
/// C-terminal (Offset == 1): "(?&lt;=[sites])(?=[^inhibitors])";
/// N-terminal (otherwise):   "(?=[sites][^inhibitors])".
/// The inhibitor part is omitted when no inhibitors are set.
/// </summary>
private string BuildCleavagePattern()
{
    bool hasInhibitors = !String.IsNullOrEmpty(CleavageInhibitors);
    var builder = new StringBuilder();

    if (Offset == 1)
    {
        //C-Terminal
        builder.Append("(?<=[").Append(CleavageSites).Append("])");
        if (hasInhibitors)
        {
            builder.Append("(?=[^").Append(CleavageInhibitors).Append("])");
        }
    }
    else
    {
        //N-Terminal
        builder.Append("(?=[").Append(CleavageSites).Append("]");
        if (hasInhibitors)
        {
            builder.Append("[^").Append(CleavageInhibitors).Append("]");
        }
        builder.Append(")");
    }

    return builder.ToString();
}
394 }
395
/// <summary>
/// Serializable peptide produced by the digestion: sequence, mass, flags and the proteins it
/// was found in.
/// </summary>
396 [MessagePackObject]
397 public class DBPeptide
398 {
/// <summary>Peptide mass as assigned by the digester.</summary>
399 [Key(0)]
400 public double Mass { get; set; }
/// <summary>Integer mass used as grouping key; the digester sets it to the truncated <see cref="Mass"/>.</summary>
401 [Key(1)]
402 public int MassInt { get; set; }
/// <summary>Amino acid sequence of the peptide.</summary>
403 [Key(2)]
404 public string Sequence { get; set; }
/// <summary>Protein-start flag supplied by the creator of the peptide.</summary>
405 [Key(3)]
406 public bool ProteinStartFlag { get; set; }
/// <summary>References to the proteins this peptide belongs to.</summary>
407 [Key(4)]
408 public List<DBProtRef> DbProtRefs { get; set; }
/// <summary>Sequence as originally passed in by the creator of the peptide.</summary>
409 [Key(5)]
410 public string SequenceOriginal { get; set; }
/// <summary>Number of missed cleavages recorded for the peptide.</summary>
411 [Key(6)]
412 public int MissedCleavages { get; set; }
/// <summary>Hash over sequence, missed cleavages and protein-start flag; excluded from serialization.</summary>
413 [IgnoreMember]
414 public int SeqHash { get; set; }
415
/// <summary>Creates an empty peptide with empty sequences and no protein references.</summary>
416 public DBPeptide()
417 {
418 Sequence = "";
419 DbProtRefs = new List<DBProtRef>();
420 SequenceOriginal = "";
421 }
422
/// <summary>
/// Creates a peptide that references a single protein and computes its <see cref="SeqHash"/>.
/// Mass and MassInt are not set here.
/// </summary>
423 public DBPeptide(string sequence, string sequenceOriginal, int missedCleavages, bool proteinStartFlag, DBProtRef protRef)
424 {
425 MissedCleavages = missedCleavages;
426 Sequence = sequence;
427 SequenceOriginal = sequenceOriginal;
428 ProteinStartFlag = proteinStartFlag;
// Computed last: the hash reads the three properties assigned above.
429 SeqHash = CreateMD5();
430
431 DbProtRefs = new List<DBProtRef>(1) { protRef };
432 }
433
/// <summary>
/// Merges the protein references of <paramref name="pep"/> into this peptide, dropping duplicates.
/// </summary>
434 internal void AddToProtRefs(DBPeptide pep)
435 {
436 if (DbProtRefs != null && DbProtRefs.Count > 0)
437 {
// Builds a new list; the previous list instance is not modified.
438 DbProtRefs = DbProtRefs.Concat(pep.DbProtRefs).Distinct().ToList();
439 }
440 else
441 {
// NOTE(review): source line 442 is absent from this listing, so the else branch looks empty here - check the real file.
443 }
444 }
445
/// <summary>
/// Returns the hash code of the tuple (Sequence, MissedCleavages, ProteinStartFlag).
/// Despite its name, no MD5 digest is computed.
/// </summary>
446 public int CreateMD5()
447 {
448 return (Sequence, MissedCleavages, ProteinStartFlag).GetHashCode();
449 }
450 }
451
452 public static class HelperMethods
453 {
/// <summary>
/// Returns the first peptide in <paramref name="peptides"/> whose SeqHash equals that of
/// <paramref name="pep"/>, or null if there is none. Only the hash is compared.
/// </summary>
public static DBPeptide FindPeptideWithSameHash(this List<DBPeptide> peptides, DBPeptide pep)
{
    foreach (DBPeptide candidate in peptides)
    {
        if (candidate.SeqHash == pep.SeqHash)
        {
            return candidate;
        }
    }
    return null;
}
466
/// <summary>True if <paramref name="target"/> lies strictly between <paramref name="start"/> and <paramref name="end"/>.</summary>
public static bool IsBetweenExcludeBounds(this double target, double start, double end)
{
    if (target > start)
    {
        return target < end;
    }
    return false;
}
471
/// <summary>True if <paramref name="target"/> lies between <paramref name="start"/> and <paramref name="end"/>, bounds included.</summary>
public static bool IsBetweenIncludeBounds(this int target, int start, int end)
{
    bool outside = target < start || target > end;
    return !outside;
}
476
/// <summary>
/// Groups peptides by integer mass and merges equal peptides within each group, collecting the
/// protein references of the merged peptides on the first occurrence.
/// </summary>
/// <remarks>
/// Two peptides are equal when SeqHash, Sequence, MissedCleavages and ProteinStartFlag all match.
/// Each group is deduplicated through a dictionary lookup instead of a linear scan per peptide.
/// </remarks>
/// <param name="dbFrom">Peptides to merge.</param>
/// <param name="dbTo">Receives the resulting mass-keyed dictionary.</param>
/// <param name="parallelOptions">
/// Its MaxDegreeOfParallelism limits the PLINQ query when positive; a non-positive value
/// (such as the ParallelOptions default of -1) leaves the degree of parallelism unrestricted.
/// </param>
public static void MergeToDBDictionaries(List<DBPeptide> dbFrom, ref DigesterDB dbTo, ParallelOptions parallelOptions)
{
    ParallelQuery<IGrouping<int, DBPeptide>> groups = dbFrom.GroupBy(x => x.MassInt).AsParallel();

    // WithDegreeOfParallelism rejects values below 1, whereas ParallelOptions uses -1 for "no limit".
    if (parallelOptions.MaxDegreeOfParallelism > 0)
    {
        groups = groups.WithDegreeOfParallelism(parallelOptions.MaxDegreeOfParallelism);
    }

    dbTo.DbPeptidesDictMassKey = groups
        .Select(g =>
        {
            var theList = new List<DBPeptide>();
            var firstOccurrence = new Dictionary<(int, string, int, bool), DBPeptide>();

            foreach (var pep in g)
            {
                var key = (pep.SeqHash, pep.Sequence, pep.MissedCleavages, pep.ProteinStartFlag);
                if (firstOccurrence.TryGetValue(key, out var itemWithSameHash))
                {
                    itemWithSameHash.AddToProtRefs(pep);
                }
                else
                {
                    firstOccurrence.Add(key, pep);
                    theList.Add(pep);
                }
            }
            return (g.Key, theList);
        }).ToDictionary(t => t.Key, t => t.theList);
}
500
/// <summary>
/// Returns the first peptide in <paramref name="list"/> that DBPeptideEquals reports as equal
/// to <paramref name="comparePep"/>, or null if there is none.
/// </summary>
private static DBPeptide FindPeptideEqualTo(this List<DBPeptide> list, DBPeptide comparePep)
{
    foreach (DBPeptide candidate in list)
    {
        if (DBPeptideEquals(comparePep, candidate))
        {
            return candidate;
        }
    }
    return null;
}
513
/// <summary>
/// True if both peptides agree in SeqHash, ProteinStartFlag, MissedCleavages and Sequence.
/// The cheap hash comparison comes first so most unequal pairs are rejected early.
/// </summary>
private static bool DBPeptideEquals(DBPeptide x, DBPeptide pep)
{
    return x.SeqHash == pep.SeqHash
        && x.ProteinStartFlag == pep.ProteinStartFlag
        && x.MissedCleavages == pep.MissedCleavages
        && x.Sequence == pep.Sequence;
}
534 }
535
536 public class ProteinDigester
537 {
// Digestion parameters, fixed at construction time.
538 private readonly Enzyme _enzyme;
539 private readonly int _missedCleavages;
540 private readonly bool _useMonoisotopicMass;
541 private readonly int _minPepLength;
542 private readonly int _maxPepLength;
// The single protein this digester instance works on.
543 private readonly DBProtein _dbProtein;
544
// Peptides collected by the current digestion run; replaced by a fresh list on every run.
545 private List<DBPeptide> _dbPeptides;
546
// Bookkeeping entry that SplitProtein stores per generated peptide sequence.
547 struct PeptideInfo
548 {
// Start offset recorded for the peptide.
549 public int Start;
// Number of missed cleavages recorded for the peptide.
550 public int MissedCleavages;
551 };
552
/// <summary>Creates a digester for one protein.</summary>
/// <param name="enzyme">Enzyme whose cleavage rules are applied.</param>
/// <param name="missedCleavages">Maximum number of missed cleavages per peptide.</param>
/// <param name="useMonoisotopicMass">Passed to the peptide mass calculation.</param>
/// <param name="minPepLength">Minimum length of a stored peptide.</param>
/// <param name="maxPepLength">Maximum length of a stored peptide.</param>
/// <param name="dbProtein">Protein to digest.</param>
public ProteinDigester(Enzyme enzyme, int missedCleavages, bool useMonoisotopicMass, int minPepLength, int maxPepLength, DBProtein dbProtein)
{
    (_enzyme, _dbProtein) = (enzyme, dbProtein);
    (_minPepLength, _maxPepLength) = (minPepLength, maxPepLength);
    _missedCleavages = missedCleavages;
    _useMonoisotopicMass = useMonoisotopicMass;
    _dbPeptides = new List<DBPeptide>();
}
563
/// <summary>
/// Digests the protein given at construction and returns the stored peptides.
/// Every call starts from a fresh, empty list.
/// </summary>
public List<DBPeptide> DigestProteinIntoList()
{
    var collected = new List<DBPeptide>();
    _dbPeptides = collected;

    DigestSingleProtein(_dbProtein);

    return _dbPeptides;
}
570
/// <summary>
/// Digests one protein according to the enzyme's cleavage sites: "X" runs the no-enzyme
/// digestion, an empty string stores the whole protein as one peptide, and anything else
/// performs a regular enzymatic split.
/// </summary>
/// <remarks>
/// For enzymatic digestion of a protein starting with 'M', the protein without that first
/// residue is split as well; its peptides are stored unless the full-length digest already
/// holds a peptide with the same hash and the same missed-cleavage count.
/// </remarks>
private void DigestSingleProtein(DBProtein protein)
{
    string cleavageSites = _enzyme.CleavageSites;

    if (cleavageSites == "X")
    {
        //no-enzyme search
        DigestSingleProteinWithNoEnzyme(protein);
        return;
    }

    if (cleavageSites == "")
    {
        //no cleavage (peptide db or top down)
        bool startsProtein = IsProteinStart(0, protein.Sequence[0]);
        SaveSinglePeptide(new DBPeptide(protein.Sequence, protein.Sequence, 0, startsProtein, protein.DbProtRef));
        return;
    }

    List<DBPeptide> fullLengthPeptides = SplitProtein(_enzyme, protein, _missedCleavages, 0);
    foreach (DBPeptide fullLengthPeptide in fullLengthPeptides)
    {
        SaveSinglePeptide(fullLengthPeptide);
    }

    if (!protein.Sequence.StartsWith("M", StringComparison.Ordinal))
    {
        return;
    }

    var withoutFirstResidue = new DBProtein(protein.DbProtRef.ProtIdentifier,
                                            protein.DbProtRef.MappingId,
                                            protein.Sequence.Substring(1));

    foreach (DBPeptide truncatedPeptide in SplitProtein(_enzyme, withoutFirstResidue, _missedCleavages, 1))
    {
        DBPeptide twin = fullLengthPeptides.FindPeptideWithSameHash(truncatedPeptide);
        bool alreadyCovered = twin != null && twin.MissedCleavages == truncatedPeptide.MissedCleavages;
        if (!alreadyCovered)
        {
            SaveSinglePeptide(truncatedPeptide);
        }
    }
}
612
/// <summary>
/// No-enzyme digestion: stores every substring of the protein whose length lies within the
/// configured peptide length range (and does not exceed 150 residues).
/// </summary>
/// <remarks>
/// Extension of a start position stops once the peptide would contain more than one X, more
/// than three B, more than three Z, or more than three B and Z combined. Peptides containing
/// X, B or Z go through SaveSinglePeptide, which resolves the ambiguous residues; all others
/// are stored only if their mass lies strictly between 200 and 6000.
/// Two defects are fixed here: the length check used "l - s" although l already is the
/// length, and the ambiguity look-up included the residue directly after the peptide.
/// </remarks>
private void DigestSingleProteinWithNoEnzyme(DBProtein protein)
{
    string sequence = protein.Sequence;

    var positionsOfX = new SortedSet<int>();
    var positionsOfZ = new SortedSet<int>();
    var positionsOfB = new SortedSet<int>();
    for (int i = 0; i < sequence.Length; ++i)
    {
        switch (sequence[i])
        {
            case 'X': positionsOfX.Add(i); break;
            case 'Z': positionsOfZ.Add(i); break;
            case 'B': positionsOfB.Add(i); break;
        }
    }

    for (int s = 0; s < sequence.Length; ++s)
    {
        for (int l = 1; l <= sequence.Length - s; ++l)
        {
            if (l < _minPepLength)
            {
                continue;
            }
            if (l > _maxPepLength || l > 150)
            {
                break;
            }

            // GetViewBetween is inclusive on both ends; the peptide covers s .. s + l - 1.
            int last = s + l - 1;
            int xCount = positionsOfX.GetViewBetween(s, last).Count;
            if (xCount > 1)
            {
                break;
            }
            int bCount = positionsOfB.GetViewBetween(s, last).Count;
            if (bCount > 3)
            {
                break;
            }
            int zCount = positionsOfZ.GetViewBetween(s, last).Count;
            if (zCount > 3 || zCount + bCount > 3)
            {
                break;
            }

            string sseq = sequence.Substring(s, l);
            bool startsProtein = IsProteinStart(s, sequence[0]);

            if (xCount > 0 || bCount > 0 || zCount > 0)
            {
                //handle replacement characters
                SaveSinglePeptide(new DBPeptide(sseq, sseq, 0, startsProtein, protein.DbProtRef));
            }
            else
            {
                //save pep
                double mass = ChemicalUtils.CalculatePeptideMass(sseq, _useMonoisotopicMass);
                if (mass.IsBetweenExcludeBounds(200, 6000))
                {
                    SavePeptide(new DBPeptide(sseq, sseq, 0, startsProtein, protein.DbProtRef));
                }
            }
        }
    }
}
683
/// <summary>
/// Stores a peptide after the length check, resolving ambiguous residues first.
/// </summary>
/// <remarks>
/// Peptides without B, Z or X are stored directly. Peptides with more than one X, or with more
/// than three B and Z in total, are dropped. Otherwise a peptide containing X is expanded by
/// GenerateCombinationsForX, and one containing only B/Z by CalculateMass.
/// </remarks>
private void SaveSinglePeptide(DBPeptide pep)
{
    if (String.IsNullOrEmpty(pep.Sequence))
    {
        return;
    }

    int length = pep.Sequence.Length;
    if (length < _minPepLength || length > _maxPepLength)
    {
        return;
    }

    int xCount = 0;
    int bzCount = 0;
    foreach (char residue in pep.Sequence)
    {
        if (residue == 'X')
        {
            xCount++;
        }
        else if (residue == 'B' || residue == 'Z')
        {
            bzCount++;
        }
    }

    if (xCount == 0 && bzCount == 0)
    {
        pep.Mass = ChemicalUtils.CalculatePeptideMass(pep.Sequence, _useMonoisotopicMass);
        SavePeptide(pep);
        return;
    }

    if (xCount > 1 || bzCount > 3)
    {
        return;
    }

    // One '#' placeholder per residue; resolved positions are overwritten during expansion.
    List<char> replacements = Enumerable.Repeat('#', length).ToList();

    if (xCount > 0)
    {
        GenerateCombinationsForX(pep, replacements);
    }
    else
    {
        CalculateMass(pep, replacements);
    }
}
727
/// <summary>
/// Splits a protein at the enzyme's cleavage positions and returns the resulting peptides,
/// including those with up to <paramref name="missedCleavages"/> missed cleavages and, for
/// semi-specific enzymes, their truncated forms. Each distinct sequence is returned once.
/// </summary>
/// <param name="enzyme">Provides the cleavage regex and the specificity.</param>
/// <param name="prot">Protein whose sequence is split.</param>
/// <param name="missedCleavages">Maximum number of following fragments appended to a fragment.</param>
/// <param name="offset">Initial start offset; when greater than 0 only the first fragment is used as a starting fragment.</param>
/// <returns>Peptides without mass information; length filtering happens when they are saved.</returns>
728 private static List<DBPeptide> SplitProtein(Enzyme enzyme, DBProtein prot, int missedCleavages, int offset)
729 {
730 string[] peptides = enzyme.TheRegex.Split(prot.Sequence);
731 List<DBPeptide> peps = new List<DBPeptide>(offset == 0 ? peptides.Length * 3 : 3); // guessing magic numbers
732 int start = offset;
733
// Sequences generated so far, with their start offset and missed-cleavage count; also used for de-duplication.
734 Dictionary<string, PeptideInfo> allPeps = new Dictionary<string, PeptideInfo>();
735 for (int i = 0; i < peptides.Length; ++i)
736 {
// With an offset, only peptides that begin with the first fragment are generated.
737 if (offset > 0 && i > 0)
738 break;
739 if (!String.IsNullOrEmpty(peptides[i]))
740 {
741 string myPeptide = peptides[i];
742 if (!allPeps.ContainsKey(myPeptide))
743 {
744 //peps.Add(new DBPeptide(myPeptide, myPeptide, 0, IsProteinStart(0, myPeptide[0]), prot.DbProtRef));
745 peps.Add(new DBPeptide(myPeptide, myPeptide, 0, IsProteinStart(myPeptide, prot.Sequence), prot.DbProtRef));
746 allPeps.Add(peptides[i], new PeptideInfo
747 {
748 Start = start,
749 MissedCleavages = 0
750 });
751 }
752
// Append following non-empty fragments one by one; each appended fragment counts as one missed cleavage.
753 int j = 1;
754 string pep = peptides[i];
755
756 int count = 0;
757 while (count < missedCleavages)
758 {
759 if (i + j < peptides.Length)
760 {
761 if (!String.IsNullOrEmpty(peptides[i + j]))
762 {
763 pep += peptides[i + j];
764 if (!allPeps.ContainsKey(pep))
765 {
766 //peps.Add(new DBPeptide(pep, pep, count + 1, IsProteinStart(0, pep[0]), prot.DbProtRef));
767 peps.Add(new DBPeptide(pep, pep, count + 1, IsProteinStart(pep, prot.Sequence), prot.DbProtRef));
768 allPeps.Add(pep, new PeptideInfo
769 {
770 Start = start,
771 MissedCleavages = count + 1
772 });
773 }
774
775 ++count;
776 }
777
778 ++j;
779 }
780 else
781 {
782 break;
783 }
784 }
785
786 start += peptides[i].Length;
787 }
788 }
789
// Semi-specific digestion: additionally generate truncated forms of every peptide found above.
790 if (enzyme.CleavageSites != "X" && (enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI ||
791 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_C ||
792 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_N))
793 {
// Iterate over a snapshot of the keys because allPeps grows inside the loop.
794 foreach (string peptide in allPeps.Keys.ToArray())
795 {
796 PeptideInfo info = allPeps[peptide];
797 for (int i = 1; i < peptide.Length - 1; ++i)
798 {
// newFront drops the first i residues, newBack drops the last i residues.
799 string newFront = peptide.Substring(i);
800 string newBack = peptide.Substring(0, peptide.Length - i);
801 if (!allPeps.ContainsKey(newFront) && (enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI ||
802 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_C))
803 {
804 //peps.Add(new DBPeptide(newFront, newFront, info.MissedCleavages, IsProteinStart(0, newFront[0]), prot.DbProtRef));
805 peps.Add(new DBPeptide(newFront, newFront, info.MissedCleavages, IsProteinStart(newFront, prot.Sequence), prot.DbProtRef));
806
807 allPeps.Add(newFront, new PeptideInfo
808 {
809 Start = info.Start + i,
810 MissedCleavages = info.MissedCleavages
811 });
812 }
813
// newBack is only generated for SEMI and SEMI_N, and only if not already present.
814 if (allPeps.ContainsKey(newBack) || (enzyme.Specificity != Enzyme.CLEAVAGE_SPECIFICITY.SEMI &&
815 enzyme.Specificity != Enzyme.CLEAVAGE_SPECIFICITY.SEMI_N)) continue;
816
817 //peps.Add(new DBPeptide(newBack, newBack, info.MissedCleavages, IsProteinStart(0, newBack[0]), prot.DbProtRef));
818 peps.Add(new DBPeptide(newBack, newBack, info.MissedCleavages, IsProteinStart(newBack, prot.Sequence), prot.DbProtRef));
819 allPeps.Add(newBack, new PeptideInfo
820 {
821 Start = info.Start,
822 MissedCleavages = info.MissedCleavages
823 });
824 }
825 }
826 }
827
828 return peps;
829 }
830
/// <summary>
/// Stores a peptide in the result list after computing its mass, integer mass and hash.
/// Peptides shorter than two residues or outside the configured length range are ignored.
/// </summary>
private void SavePeptide(DBPeptide pep)
{
    int length = pep.Sequence.Length;
    bool lengthAccepted = length >= 2 && length >= _minPepLength && length <= _maxPepLength;
    if (!lengthAccepted)
    {
        return;
    }

    double mass = ChemicalUtils.CalculatePeptideMass(pep.Sequence, _useMonoisotopicMass);
    pep.Mass = mass;
    pep.MassInt = (int)mass;
    pep.SeqHash = pep.CreateMD5();

    _dbPeptides.Add(pep);
}
843
/// <summary>
/// Creates a peptide with mass, integer mass and hash filled in, without storing it.
/// </summary>
/// <returns>The peptide, or null if it is shorter than two residues or outside the configured length range.</returns>
private DBPeptide CreatePeptide(string sequence, string origSequence, int missedCleavages, bool proteinStartFlag, DBProtRef protRef)
{
    var created = new DBPeptide(sequence, origSequence, missedCleavages, proteinStartFlag, protRef);

    int length = created.Sequence.Length;
    bool lengthAccepted = length >= 2 && length >= _minPepLength && length <= _maxPepLength;
    if (!lengthAccepted)
    {
        return null;
    }

    double mass = ChemicalUtils.CalculatePeptideMass(created.Sequence, _useMonoisotopicMass);
    created.Mass = mass;
    created.MassInt = (int)mass;
    created.SeqHash = created.CreateMD5();
    return created;
}
856
/// <summary>
/// True if a peptide starting at <paramref name="startPosition"/> begins the protein: either at
/// position 0, or at position 1 when the protein's first residue is 'M'.
/// </summary>
private static bool IsProteinStart(int startPosition, char protStarter)
{
    return (startPosition == 0 || (startPosition == 1 && protStarter == 'M'));
}

/// <summary>
/// True if the first occurrence of <paramref name="pep"/> in <paramref name="prot"/> begins the
/// protein (see the position-based overload). A peptide that does not occur yields false.
/// </summary>
private static bool IsProteinStart(string pep, string prot)
{
    // Sequences are plain residue letters: search ordinally instead of with the current culture.
    return IsProteinStart(prot.IndexOf(pep, StringComparison.Ordinal), prot[0]);
}
866
/// <summary>
/// Resolves the ambiguous residues B (to N and D) and Z (to E and Q) and stores every fully
/// resolved peptide. All B are resolved before any Z.
/// </summary>
/// <remarks>
/// Each resolution works on its own copy of the peptide. Previously the single incoming instance
/// was mutated and stored once per variant, so all stored entries ended up as the same object
/// carrying the last variant's sequence.
/// </remarks>
/// <param name="info">Peptide to resolve; it is stored as-is when it contains neither B nor Z.</param>
/// <param name="replacements">Per-position record of the residues substituted so far.</param>
private void CalculateMass(DBPeptide info,
                           List<char> replacements)
{
    int ambiguousIndex = info.Sequence.IndexOf('B');
    char firstChoice = 'N';
    char secondChoice = 'D';

    if (ambiguousIndex < 0)
    {
        ambiguousIndex = info.Sequence.IndexOf('Z');
        firstChoice = 'E';
        secondChoice = 'Q';
    }

    if (ambiguousIndex < 0)
    {
        // Nothing left to resolve.
        SavePeptide(info);
        //FinalCalculation(info);
        return;
    }

    foreach (char residue in new[] { firstChoice, secondChoice })
    {
        string resolvedSequence = info.Sequence
            .Remove(ambiguousIndex, 1)
            .Insert(ambiguousIndex, residue.ToString());

        var resolved = new DBPeptide
        {
            MissedCleavages = info.MissedCleavages,
            Sequence = resolvedSequence,
            SequenceOriginal = info.SequenceOriginal,
            ProteinStartFlag = info.ProteinStartFlag,
            DbProtRefs = info.DbProtRefs
        };

        var currentReplacements = new List<char>(replacements)
        {
            [ambiguousIndex] = residue
        };

        CalculateMass(resolved, currentReplacements);
    }
}
916
/// <summary>
/// Computes the peptide mass (the value itself is not used here) and stores the peptide.
/// </summary>
private void FinalCalculation(DBPeptide info)
{
    _ = ChemicalUtils.CalculatePeptideMass(info.Sequence, _useMonoisotopicMass);
    SavePeptide(info);
}
922
/// <summary>
/// Replaces the first X of the peptide by every amino acid known to ChemicalUtils (except the
/// keys '^', '$' and 'J') and passes each resulting peptide on to CalculateMass.
/// </summary>
/// <param name="info">Peptide containing an X; it is not modified.</param>
/// <param name="replacements">Per-position record of substituted residues; copied for every variant.</param>
private void GenerateCombinationsForX(DBPeptide info,
                                      List<char> replacements)
{
    foreach (char aa in ChemicalUtils.AminoAcids.Keys)
    {
        bool isExcludedKey = aa == '^' || aa == '$' || aa == 'J';
        if (isExcludedKey)
        {
            continue;
        }

        int xPosition = info.Sequence.IndexOf('X');
        string substituted = info.Sequence
            .Insert(xPosition, aa.ToString())
            .Remove(xPosition + 1, 1);

        var variantReplacements = new List<char>(replacements);
        variantReplacements[xPosition] = aa;

        if (substituted.Contains('X'))
        {
            //should not occur
            continue;
        }

        var variant = new DBPeptide
        {
            MissedCleavages = info.MissedCleavages,
            Sequence = substituted,
            SequenceOriginal = info.SequenceOriginal,
            ProteinStartFlag = info.ProteinStartFlag,
            DbProtRefs = info.DbProtRefs
        };
        CalculateMass(variant, variantReplacements);
    }
}
962 }
963
/// <summary>
/// Serializable container for digested peptides, keyed by their integer mass.
/// </summary>
[MessagePackObject]
public class DigesterDB
{
    /// <summary>Peptides grouped by integer mass.</summary>
    [Key(0)]
    public Dictionary<int, List<DBPeptide>> DbPeptidesDictMassKey { get; set; }

    /// <summary>Creates a database with an empty dictionary.</summary>
    public DigesterDB() => DbPeptidesDictMassKey = new Dictionary<int, List<DBPeptide>>();
}
975}
Simplified peptide class that stores peptide/peptidoform information.
Definition Database.cs:9
Settings for digestion, ion calculation and VectorSearch.
Definition Settings.cs:9
int MAX_CLEAVAGES
Maximum number of missed cleavages allowed during digestion.
Definition Settings.cs:14
Dictionary< string, double > FIXED_MODIFICATIONS
Dictionary for fixed modifications that maps amino acids to their possible modification masses.
Definition Settings.cs:44
Dictionary< string, double > VARIABLE_MODIFICATIONS
Dictionary for variable modifications that maps amino acids to their possible modification masses.
Definition Settings.cs:48
int MIN_PEP_LENGTH
Minimum peptide length.
Definition Settings.cs:18
int MAX_PEP_LENGTH
Maximum peptide length.
Definition Settings.cs:22
DBPeptide(string sequence, string sequenceOriginal, int missedCleavages, bool proteinStartFlag, DBProtRef protRef)
DBProtein(string identifier, int id, string sequence, bool isDecoy=false)
Dictionary< int, List< DBPeptide > > DbPeptidesDictMassKey
CLEAVAGE_SPECIFICITY Specificity
static List< Peptide > DigestFasta(string fastaFileName, Settings settings, bool generateDecoys=false, double coreUsage=0.75)
List< DBPeptide > DigestProteinIntoList()
ProteinDigester(Enzyme enzyme, int missedCleavages, bool useMonoisotopicMass, int minPepLength, int maxPepLength, DBProtein dbProtein)