hash_util.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. from pandas import read_csv
  2. from numpy import float64
  class HashString(object):
      """
      Rolling polynomial hash over a peptide (amino-acid) sequence.

      The sequence is treated as a base-``BASE`` number whose digits are the
      ``AMINO_ACIDS`` codes, reduced modulo ``MOD``. Characters can be
      appended with ``insert`` and removed from either end with
      ``pop_front`` / ``pop_back``, which lets the hash be rolled over a
      longer sequence.

      The size and hash (probably) uniquely identify the peptide; the size
      is needed because 'A' has code 0, so leading 'A's do not change the
      hash value.

      Attributes
      ----------
      hash_value : int
          Hash of the characters currently in the window.
      first_index : int
          Index in ``charstring`` of the first character still in the window.
      size : int
          Number of characters currently in the window.
      charstring : list of str
          Every character inserted so far that has not been popped from the
          back; entries before ``first_index`` were popped from the front.
      """
      AMINO_ACIDS = {'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'Q': 5, 'E': 6,
                     'G': 7, 'H': 8, 'I': 9, 'L': 10, 'K': 11, 'M': 12,
                     'F': 13, 'P': 14, 'S': 15, 'T': 16, 'W': 17, 'Y': 18,
                     'V': 19}
      MOD = 2**64-1
      BASE = 20

      # Instances are mutable and define __eq__, so they are deliberately
      # unhashable (this is also what Python does implicitly).
      __hash__ = None

      def __init__(self, string_to_hash=""):
          """
          Create a hash string, optionally seeded with a peptide sequence.

          Parameters
          ----------
          string_to_hash : str, optional
              Optional starting peptide sequence (case-insensitive).
              The default is "".

          Raises
          ------
          KeyError
              If the sequence contains a character not in ``AMINO_ACIDS``.

          Returns
          -------
          None.
          """
          self.hash_value = 0
          self.first_index = 0
          self.size = 0
          self.charstring = []
          for char in string_to_hash.upper():
              self.insert(char)

      def _po(self, a, b):
          """
          Compute ``a**b`` modulo ``MOD``.

          Parameters
          ----------
          a : int
              Base.
          b : int
              Non-negative exponent.

          Raises
          ------
          ValueError
              If ``b`` is negative.

          Returns
          -------
          int
              ``a**b % MOD``.
          """
          if b < 0:
              raise ValueError('Exponent must be non-negative')
          return pow(a, b, self.MOD)

      def _rehash(self):
          """Recompute the hash of the current window from its characters."""
          value = 0
          for char in self.charstring[self.first_index:]:
              value = (value * self.BASE + self.AMINO_ACIDS[char]) % self.MOD
          return value

      def __eq__(self, other):
          # Defer to the other operand for unrelated types instead of
          # failing on a missing attribute.
          if not isinstance(other, HashString):
              return NotImplemented
          return (self.size == other.size) and (self.hash_value == other.hash_value)

      def __str__(self):
          return self.getString() + " : " + str(self.hash_value)

      def insert(self, char):
          """
          Inserts a character at the end of the sequence.

          Parameters
          ----------
          char : str
              Character to insert (case-insensitive).

          Raises
          ------
          KeyError
              If the character is not in ``AMINO_ACIDS``. The object is
              left unchanged in that case.

          Returns
          -------
          None.
          """
          char = char.upper()
          # Look the code up first so an invalid character cannot leave the
          # hash half-updated.
          code = self.AMINO_ACIDS[char]
          self.hash_value = (self.hash_value * self.BASE + code) % self.MOD
          self.charstring.append(char)
          self.size += 1

      def pop_front(self):
          """
          Removes the first character in the string.

          Raises
          ------
          IndexError
              When the hash string already is of length 0.

          Returns
          -------
          str
              The removed character.
          """
          if self.size == 0:
              raise IndexError('Unable to pop HashString of length 0')
          removed = self.charstring[self.first_index]
          # The first character carries weight BASE**(size - 1); Python's %
          # already maps the possibly negative difference into [0, MOD).
          weight = self._po(self.BASE, self.size - 1)
          self.hash_value = (self.hash_value - self.AMINO_ACIDS[removed] * weight) % self.MOD
          self.first_index += 1
          self.size -= 1
          return removed

      def pop_back(self):
          """
          Removes last character in sequence.

          The hash is recomputed from the remaining characters, which takes
          time linear in the remaining length. Undoing the last step
          arithmetically is not possible in general: BASE and MOD share the
          factor 5, so BASE has no inverse modulo MOD, and plain integer
          division is wrong once the hash has wrapped around MOD.

          Raises
          ------
          IndexError
              When sequence already is of length 0.

          Returns
          -------
          str
              The removed character.
          """
          if self.size == 0:
              raise IndexError('Unable to pop HashString of length 0')
          # The list always ends with the window's last character, regardless
          # of how many characters were popped from the front.
          removed = self.charstring.pop()
          self.size -= 1
          self.hash_value = self._rehash()
          return removed

      def getString(self):
          """Return the characters currently in the window as a string."""
          return ''.join(self.charstring[self.first_index:])

      def getHash(self):
          """Return the hash value of the current window."""
          return self.hash_value

      def getSize(self):
          """Return the number of characters in the current window."""
          return self.size
  class PeptideSequence(HashString):
      """
      Convenience wrapper around HashString for looking up peptides.

      The (size, hash) pair returned by ``loc`` is meant to be used as a
      key when querying the clean dataframes, for example:

          peptide_sequence = PeptideSequence('GATCA')
          info = clean_df.loc[peptide_sequence.loc(),:]
      """

      def __init__(self, sequence):
          """
          Hash the given peptide sequence.

          Parameters
          ----------
          sequence : string
              Peptide sequence to be hashed.

          Returns
          -------
          None.
          """
          super(PeptideSequence, self).__init__(sequence)

      def loc(self):
          """
          Build the lookup key for this peptide.

          Returns
          -------
          tuple
              ``(size, hash)`` of the sequence.
          """
          size = self.getSize()
          digest = self.getHash()
          return (size, digest)
  def import_clean_data(path):
      """
      Load a clean data table from a CSV file.

      The first row is used as the column header and the first two columns
      together form the row index (presumably the (size, hash) pair
      produced by ``PeptideSequence.loc`` -- confirm against the data files).

      Parameters
      ----------
      path : str or file-like
          Location of the CSV file; passed straight to ``pandas.read_csv``.

      Returns
      -------
      pandas.DataFrame
          The table, indexed by its first two columns.
      """
      index_columns = [0, 1]
      header_row = 0
      frame = read_csv(path, index_col=index_columns, header=header_row)
      return frame