14 bool generateDecoys =
false,
15 double coreUsage = 0.75)
17 var trypsin =
new Enzyme();
18 trypsin.Name =
"Trypsin";
19 trypsin.CleavageSites =
"KR";
20 trypsin.CleavageInhibitors =
"P";
24 var proteins = ReadInFasta(fastaFileName,
false);
26 var peptides = DigestProteins(proteins,
34 var decoyProteins =
new List<DBProtein>();
35 foreach (var protein
in proteins)
37 var decoySequence = ReverseSequence(protein.Sequence);
38 var decoyProtein =
new DBProtein(protein.DbProtRef.ProtIdentifier, -protein.DbProtRef.MappingId, decoySequence,
true);
39 decoyProteins.Add(decoyProtein);
42 var decoyPeptides = DigestProteins(decoyProteins,
48 peptides.AddRange(decoyPeptides);
54 private static string ReverseSequence(
string seq)
56 char[] array = seq.ToCharArray();
58 return new String(array);
61 private static List<Peptide> DigestProteins(List<DBProtein> proteins,
67 var opts =
new ParallelOptions { MaxDegreeOfParallelism = (int) Math.Ceiling(Environment.ProcessorCount * coreUsage) };
68 var concurrentPeptideList =
new ConcurrentBag<List<DBPeptide>>();
69 Parallel.ForEach(proteins, opts, (protein) => {
71 concurrentPeptideList.Add(digester.DigestProteinIntoList());
74 var peptideList = concurrentPeptideList.SelectMany(x => x).ToList();
75 var massToPeptides =
new DigesterDB();
76 HelperMethods.MergeToDBDictionaries(peptideList, ref massToPeptides, opts);
78 var peptides =
new List<Peptide>();
79 foreach (var item
in massToPeptides.DbPeptidesDictMassKey)
81 var currentPeptides = item.Value;
82 foreach (var peptide
in currentPeptides)
84 var peptidoforms = GetPeptidoforms(peptide, settings, isDecoy);
85 peptides.AddRange(peptidoforms);
92 private static List<Peptide> GetPeptidoforms(DBPeptide dbPeptide,
Settings settings,
bool isDecoy)
94 var peptides =
new List<Peptide>();
95 var mods =
new Dictionary<int, double>();
99 for (
int i = 0; i < dbPeptide.Sequence.Length; i++)
108 var peptide =
new Peptide(dbPeptide.Sequence, dbPeptide.Mass, mods, settings, isDecoy);
109 peptides.Add(peptide);
115 addPeptidoformsForModification(peptides, modification, settings);
122 private static void addPeptidoformsForModification(List<Peptide> peptides,
123 KeyValuePair<string, double> modification,
126 var peptidoforms =
new List<Peptide>();
128 foreach (var peptide
in peptides)
130 var possibleModificationSites =
new List<int>();
131 for (
int i = 0; i < peptide.sequence.Length; i++)
133 if (peptide.sequence[i].ToString() == modification.Key)
135 possibleModificationSites.Add(i);
139 var possibleCombinations = getAllPossibleCombinations(possibleModificationSites);
141 foreach (var combination
in possibleCombinations)
143 var peptidoform =
new Peptide(peptide.sequence,
145 new Dictionary<int, double>(),
149 foreach (var mod
in peptide.modifications)
151 peptidoform.addModification(mod.Key, mod.Value);
155 foreach (var position
in combination)
157 peptidoform.addModification(position, modification.Value);
160 peptidoforms.Add(peptidoform);
164 peptides.AddRange(peptidoforms);
167 private static List<List<int>> getAllPossibleCombinations(List<int> possibleModificationSites)
169 var possibleCombinations =
new List<List<int>>();
171 for (
int i = 0; i < (1 << possibleModificationSites.Count); ++i)
173 var combination =
new List<int>();
174 for (
int j = 0; j < possibleModificationSites.Count; ++j)
176 if ((i & (1 << j)) != 0)
178 combination.Add(possibleModificationSites[j]);
181 possibleCombinations.Add(combination);
184 return possibleCombinations;
187 private static List<DBProtein> ReadInFasta(
string fastaFileName,
bool isDecoy)
189 List<DBProtein> proteins =
new List<DBProtein>();
190 string regexPatternSequence =
"[^ARNDCEQGHILKMFPSTUWYVXBZJO]";
193 StreamReader fastaFileReader =
new StreamReader(fastaFileName);
196 string currentLine =
"";
197 string sequence =
"";
198 string identifier =
"";
200 while ((currentLine = fastaFileReader.ReadLine()) !=
null)
202 if (currentLine.StartsWith(
">", StringComparison.Ordinal))
204 if (!
string.IsNullOrWhiteSpace(sequence))
207 if (Regex.IsMatch(sequence, regexPatternSequence))
209 var builder =
new StringBuilder(
"Cannot parse ");
210 builder.Append(
"fasta file at identifier " + identifier +
". Sequence error. ");
211 Console.WriteLine(builder.ToString());
212 sequence = String.Empty;
213 identifier =
string.Empty;
214 throw new Exception(
"Parsing error.");
216 proteins.Add(GenerateDbProtein(mappingID, isDecoy, identifier, sequence));
221 int index = currentLine.IndexOfAny(
new[] {
' ',
'|' }, 0);
225 identifier = currentLine.Substring(1);
229 identifier = currentLine.Substring(1, index - 1);
230 int indexNumber = identifier.IndexOfAny(
new[]
231 {
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9'});
232 while (indexNumber == -1)
234 index = currentLine.IndexOfAny(
new[] {
' ',
'|' }, index + 1);
237 identifier = currentLine.Substring(1);
241 identifier = currentLine.Substring(1, index - 1);
242 indexNumber = identifier.IndexOfAny(
new[]
243 {
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9'});
249 sequence += currentLine.ToUpper();
253 if (!
string.IsNullOrWhiteSpace(sequence))
255 if (Regex.IsMatch(sequence, regexPatternSequence))
257 var builder =
new StringBuilder(
"Cannot parse ");
258 builder.Append(
"fasta file at identifier " + identifier +
". Sequence error. ");
259 Console.WriteLine(builder.ToString());
260 throw new Exception(
"Parsing error.");
264 proteins.Add(GenerateDbProtein(mappingID, isDecoy, identifier, sequence));
272 Console.WriteLine(
"Fasta file error");
273 Console.WriteLine(e.ToString());
278 fastaFileReader.Close();
284 private static DBProtein GenerateDbProtein(
int mappingID,
bool isDecoy,
string identifier,
string sequence)
290 return new DBProtein(identifier, -mappingID, sequence);
292 return new DBProtein(identifier, mappingID, sequence);
538 private readonly
Enzyme _enzyme;
539 private readonly
int _missedCleavages;
540 private readonly
bool _useMonoisotopicMass;
541 private readonly
int _minPepLength;
542 private readonly
int _maxPepLength;
545 private List<DBPeptide> _dbPeptides;
550 public int MissedCleavages;
556 _missedCleavages = missedCleavages;
557 _useMonoisotopicMass = useMonoisotopicMass;
558 _minPepLength = minPepLength;
559 _maxPepLength = maxPepLength;
560 _dbProtein = dbProtein;
561 _dbPeptides =
new List<DBPeptide>();
566 _dbPeptides =
new List<DBPeptide>();
567 DigestSingleProtein(_dbProtein);
571 private void DigestSingleProtein(
DBProtein protein)
574 if (_enzyme.CleavageSites ==
"X")
577 DigestSingleProteinWithNoEnzyme(protein);
579 else if (_enzyme.CleavageSites ==
"")
586 var peptides = SplitProtein(_enzyme, protein, _missedCleavages, 0);
587 foreach (var pep
in peptides)
589 SaveSinglePeptide(pep);
592 if (protein.
Sequence.StartsWith(
"M", StringComparison.Ordinal) && _enzyme.CleavageSites !=
"X" && _enzyme.CleavageSites !=
"")
595 var otherPeptides = SplitProtein(_enzyme, prot, _missedCleavages, 1);
597 foreach (var pep
in otherPeptides)
599 var peptideWithSameHash = peptides.FindPeptideWithSameHash(pep);
600 if (peptideWithSameHash ==
null)
602 SaveSinglePeptide(pep);
605 if (peptideWithSameHash.MissedCleavages == pep.MissedCleavages)
continue;
607 SaveSinglePeptide(pep);
613 private void DigestSingleProteinWithNoEnzyme(DBProtein protein)
615 SortedSet<int> positionsOfX =
new SortedSet<int>();
616 SortedSet<int> positionsOfZ =
new SortedSet<int>();
617 SortedSet<int> positionsOfB =
new SortedSet<int>();
618 MatchCollection collX = Regex.Matches(protein.Sequence,
"X");
619 MatchCollection collZ = Regex.Matches(protein.Sequence,
"Z");
620 MatchCollection collB = Regex.Matches(protein.Sequence,
"B");
622 for (
int m = 0; m < collX.Count; ++m)
624 positionsOfX.Add(collX[m].Index);
627 for (
int m = 0; m < collZ.Count; ++m)
629 positionsOfZ.Add(collZ[m].Index);
632 for (
int m = 0; m < collB.Count; ++m)
634 positionsOfB.Add(collB[m].Index);
637 for (
int s = 0; s < protein.Sequence.Length; ++s)
639 for (
int l = 1; l < protein.Sequence.Length - s + 1; ++l)
642 if (seqL < _minPepLength)
644 if (seqL > _maxPepLength)
648 SortedSet<int> xInSeq = positionsOfX.GetViewBetween(s, s + l);
649 if (xInSeq.Count > 1)
651 SortedSet<int> bInSeq = positionsOfB.GetViewBetween(s, s + l);
652 if (bInSeq.Count > 3)
654 SortedSet<int> zInSeq = positionsOfZ.GetViewBetween(s, s + l);
655 if (zInSeq.Count > 3 || zInSeq.Count + bInSeq.Count > 3)
657 if (xInSeq.Count > 0 || bInSeq.Count > 0 || zInSeq.Count > 0)
660 SaveSinglePeptide(
new DBPeptide(protein.Sequence.Substring(s, l), protein.Sequence.Substring(s, l), 0,
661 IsProteinStart(s, protein.Sequence[0]), protein.DbProtRef));
666 string sseq = protein.Sequence.Substring(s, l);
667 double mass = ChemicalUtils.CalculatePeptideMass(sseq, _useMonoisotopicMass);
669 if (mass.IsBetweenExcludeBounds(200, 6000))
677 SavePeptide(
new DBPeptide(sseq, sseq, 0, IsProteinStart(s, protein.Sequence[0]), protein.DbProtRef));
684 private void SaveSinglePeptide(DBPeptide pep)
686 if (String.IsNullOrEmpty(pep.Sequence))
688 if (pep.Sequence.Length < _minPepLength || pep.Sequence.Length > _maxPepLength)
691 if (pep.Sequence.Contains(
'B') || pep.Sequence.Contains(
'Z') || pep.Sequence.Contains(
'X'))
693 string temp = pep.Sequence.Replace(
"X",
"");
694 if (pep.Sequence.Length - temp.Length > 1)
699 temp = pep.Sequence.Replace(
"B",
"");
700 temp = temp.Replace(
"Z",
"");
701 if (pep.Sequence.Length - temp.Length > 3)
706 List<char> replacements =
new List<char>();
707 for (
int i = 0; i < pep.Sequence.Length; ++i)
709 replacements.Add(
'#');
712 if (pep.Sequence.Contains(
'X'))
714 GenerateCombinationsForX(pep, replacements);
718 CalculateMass(pep, replacements);
723 pep.Mass = ChemicalUtils.CalculatePeptideMass(pep.Sequence, _useMonoisotopicMass);
728 private static List<DBPeptide> SplitProtein(Enzyme enzyme, DBProtein prot,
int missedCleavages,
int offset)
730 string[] peptides = enzyme.TheRegex.Split(prot.Sequence);
731 List<DBPeptide> peps =
new List<DBPeptide>(offset == 0 ? peptides.Length * 3 : 3);
734 Dictionary<string, PeptideInfo> allPeps =
new Dictionary<string, PeptideInfo>();
735 for (
int i = 0; i < peptides.Length; ++i)
737 if (offset > 0 && i > 0)
739 if (!String.IsNullOrEmpty(peptides[i]))
741 string myPeptide = peptides[i];
742 if (!allPeps.ContainsKey(myPeptide))
745 peps.Add(
new DBPeptide(myPeptide, myPeptide, 0, IsProteinStart(myPeptide, prot.Sequence), prot.DbProtRef));
746 allPeps.Add(peptides[i],
new PeptideInfo
754 string pep = peptides[i];
757 while (count < missedCleavages)
759 if (i + j < peptides.Length)
761 if (!String.IsNullOrEmpty(peptides[i + j]))
763 pep += peptides[i + j];
764 if (!allPeps.ContainsKey(pep))
767 peps.Add(
new DBPeptide(pep, pep, count + 1, IsProteinStart(pep, prot.Sequence), prot.DbProtRef));
768 allPeps.Add(pep,
new PeptideInfo
771 MissedCleavages = count + 1
786 start += peptides[i].Length;
790 if (enzyme.CleavageSites !=
"X" && (enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI ||
791 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_C ||
792 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_N))
794 foreach (
string peptide
in allPeps.Keys.ToArray())
796 PeptideInfo info = allPeps[peptide];
797 for (
int i = 1; i < peptide.Length - 1; ++i)
799 string newFront = peptide.Substring(i);
800 string newBack = peptide.Substring(0, peptide.Length - i);
801 if (!allPeps.ContainsKey(newFront) && (enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI ||
802 enzyme.Specificity == Enzyme.CLEAVAGE_SPECIFICITY.SEMI_C))
805 peps.Add(
new DBPeptide(newFront, newFront, info.MissedCleavages, IsProteinStart(newFront, prot.Sequence), prot.DbProtRef));
807 allPeps.Add(newFront,
new PeptideInfo
809 Start = info.Start + i,
810 MissedCleavages = info.MissedCleavages
814 if (allPeps.ContainsKey(newBack) || (enzyme.Specificity != Enzyme.CLEAVAGE_SPECIFICITY.SEMI &&
815 enzyme.Specificity != Enzyme.CLEAVAGE_SPECIFICITY.SEMI_N))
continue;
818 peps.Add(
new DBPeptide(newBack, newBack, info.MissedCleavages, IsProteinStart(newBack, prot.Sequence), prot.DbProtRef));
819 allPeps.Add(newBack,
new PeptideInfo
822 MissedCleavages = info.MissedCleavages
831 private void SavePeptide(DBPeptide pep)
833 if (pep.Sequence.Length < 2)
835 if (!pep.Sequence.Length.IsBetweenIncludeBounds(_minPepLength, _maxPepLength))
838 pep.Mass = ChemicalUtils.CalculatePeptideMass(pep.Sequence, _useMonoisotopicMass);
839 pep.MassInt = (int)(pep.Mass);
840 pep.SeqHash = pep.CreateMD5();
841 _dbPeptides.Add(pep);
844 private DBPeptide CreatePeptide(
string sequence,
string origSequence,
int missedCleavages,
bool proteinStartFlag, DBProtRef protRef)
846 var pep =
new DBPeptide(sequence, origSequence, missedCleavages, proteinStartFlag, protRef);
847 if (pep.Sequence.Length < 2)
849 if (pep.Sequence.Length < _minPepLength || pep.Sequence.Length > _maxPepLength)
851 pep.Mass = ChemicalUtils.CalculatePeptideMass(pep.Sequence, _useMonoisotopicMass);
852 pep.MassInt = (int)(pep.Mass);
853 pep.SeqHash = pep.CreateMD5();
857 private static bool IsProteinStart(
int startPosition,
char protStarter)
859 return (startPosition == 0 || (startPosition == 1 && protStarter ==
'M'));
862 private static bool IsProteinStart(
string pep,
string prot)
864 return IsProteinStart(prot.IndexOf(pep), prot[0]);
867 private void CalculateMass(DBPeptide info,
868 List<char> replacements)
870 if (info.Sequence.Contains(
'B'))
872 int firstIndex = info.Sequence.IndexOf(
'B');
873 string changedPeptide = info.Sequence;
874 char[] temp =
new char[replacements.Count];
875 replacements.CopyTo(temp);
876 List<char> currentReplacements =
new List<char>(temp)
880 changedPeptide = changedPeptide.Insert(firstIndex,
"N");
881 changedPeptide = changedPeptide.Remove(firstIndex + 1, 1);
882 info.Sequence = changedPeptide;
883 CalculateMass(info, currentReplacements);
884 changedPeptide = changedPeptide.Insert(firstIndex,
"D");
885 changedPeptide = changedPeptide.Remove(firstIndex + 1, 1);
886 currentReplacements[firstIndex] =
'D';
887 info.Sequence = changedPeptide;
888 CalculateMass(info, currentReplacements);
890 else if (info.Sequence.Contains(
'Z'))
892 int firstIndex = info.Sequence.IndexOf(
'Z');
893 string changedPeptide = info.Sequence;
894 char[] temp =
new char[replacements.Count];
895 replacements.CopyTo(temp);
896 List<char> currentReplacements =
new List<char>(temp)
900 changedPeptide = changedPeptide.Insert(firstIndex,
"E");
901 changedPeptide = changedPeptide.Remove(firstIndex + 1, 1);
902 info.Sequence = changedPeptide;
903 CalculateMass(info, currentReplacements);
904 changedPeptide = changedPeptide.Insert(firstIndex,
"Q");
905 changedPeptide = changedPeptide.Remove(firstIndex + 1, 1);
906 currentReplacements[firstIndex] =
'Q';
907 info.Sequence = changedPeptide;
908 CalculateMass(info, currentReplacements);
917 private void FinalCalculation(DBPeptide info)
919 double mass = ChemicalUtils.CalculatePeptideMass(info.Sequence, _useMonoisotopicMass);
923 private void GenerateCombinationsForX(DBPeptide info,
924 List<char> replacements)
926 foreach (
char aa
in ChemicalUtils.AminoAcids.Keys)
928 if (aa !=
'^' && aa !=
'$' && aa !=
'J')
930 int firstIndex = info.Sequence.IndexOf(
'X');
931 string changedPeptide = info.Sequence;
932 changedPeptide = changedPeptide.Insert(firstIndex, aa.ToString());
933 changedPeptide = changedPeptide.Remove(firstIndex + 1, 1);
937 List<char> currentReplacement =
new List<char>(replacements.ToArray())
943 if (changedPeptide.Contains(
'X'))
949 var newPep =
new DBPeptide
951 MissedCleavages = info.MissedCleavages,
952 Sequence = changedPeptide,
953 SequenceOriginal = info.SequenceOriginal,
954 ProteinStartFlag = info.ProteinStartFlag,
955 DbProtRefs = info.DbProtRefs
957 CalculateMass(newPep, currentReplacement);