123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
- from pandas import read_csv
- from numpy import float64
class HashString(object):
    """
    Rolling polynomial hash over a peptide (amino-acid) sequence.

    The hash of a sequence ``c_0 c_1 ... c_(n-1)`` is
    ``sum(AMINO_ACIDS[c_i] * BASE**(n-1-i)) % MOD``.  It can be rolled over
    a longer sequence by repeatedly calling ``pop_front`` and ``insert``.
    The pair (size, hash) identifies the peptide with high probability, but
    collisions are possible once the value wraps around ``MOD``.

    Attributes
    ----------
    hash_value : int
        Current hash of the live window, always in ``[0, MOD)``.
    first_index : int
        Index into ``charstring`` of the first character still in the window.
    size : int
        Number of characters currently in the window.
    charstring : list of str
        Every character inserted so far; entries before ``first_index``
        have been popped from the front and are no longer part of the window.
    """

    # Digit value of each of the 20 standard amino-acid one-letter codes.
    AMINO_ACIDS = {
        'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4,
        'Q': 5, 'E': 6, 'G': 7, 'H': 8, 'I': 9,
        'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14,
        'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19,
    }
    # NOTE: MOD and BASE define the hash values themselves; changing either
    # changes every hash.  gcd(BASE, MOD) == 5, so BASE has no modular
    # inverse and "dividing by BASE" is not a valid way to undo an insert.
    MOD = 2**64-1
    BASE = 20

    # Instances are mutable and define __eq__, so they are deliberately
    # unhashable (this is also what Python does implicitly).
    __hash__ = None

    def __init__(self, string_to_hash=""):
        """
        Build the hash of an optional starting sequence.

        Parameters
        ----------
        string_to_hash : str, optional
            Starting peptide sequence (case-insensitive). The default is "".

        Raises
        ------
        KeyError
            If the sequence contains a character that is not a key of
            ``AMINO_ACIDS``.
        """
        self.hash_value = 0
        self.first_index = 0
        self.size = 0
        self.charstring = []

        for char in string_to_hash.upper():
            self.insert(char)

    def _po(self, a, b):
        """
        Modular exponentiation.

        Parameters
        ----------
        a : int
            Base.
        b : int
            Non-negative exponent.

        Returns
        -------
        int
            ``a**b % MOD``.
        """
        return pow(a, b, self.MOD)

    def __eq__(self, other):
        """Two hash strings are equal when both size and hash value match."""
        if not isinstance(other, HashString):
            # Let Python try the reflected comparison / fall back to False
            # instead of failing with AttributeError.
            return NotImplemented
        return (self.size == other.size) and (self.hash_value == other.hash_value)

    def __str__(self):
        return self.getString() + " : " + str(self.hash_value)

    def insert(self, char):
        """
        Inserts a character at the end of the sequence.

        Parameters
        ----------
        char : str
            Single amino-acid letter to insert (case-insensitive).

        Raises
        ------
        KeyError
            If ``char`` is not a key of ``AMINO_ACIDS``.  The object is left
            unchanged in that case.

        Returns
        -------
        None.
        """
        char = char.upper()
        # Look the value up first so an unknown residue cannot leave the
        # hash half-updated.
        value = self.AMINO_ACIDS[char]
        self.hash_value = (self.hash_value * self.BASE + value) % self.MOD
        self.charstring.append(char)
        self.size += 1

    def pop_front(self):
        """
        Removes the first character in the string.

        Raises
        ------
        IndexError
            When the hash string already is of length 0.

        Returns
        -------
        str
            The removed character.
        """
        if self.size == 0:
            raise IndexError('Unable to pop HashString of length 0')

        removed = self.charstring[self.first_index]
        # The leading character carries weight BASE**(size-1).
        weight = self._po(self.BASE, self.size - 1)
        # Python's % with a positive modulus is always non-negative.
        self.hash_value = (self.hash_value - self.AMINO_ACIDS[removed] * weight) % self.MOD
        self.first_index += 1
        self.size -= 1
        return removed

    def pop_back(self):
        """
        Removes last character in sequence.

        The hash is recomputed from the remaining characters (linear in the
        window size): the stored value is reduced modulo ``MOD`` and ``BASE``
        is not invertible modulo ``MOD``, so the last insert cannot be undone
        arithmetically.

        Raises
        ------
        IndexError
            When sequence already is of length 0.

        Returns
        -------
        str
            The removed character.
        """
        if self.size == 0:
            raise IndexError('Unable to pop HashString of length 0')

        removed = self.charstring.pop()
        self.size -= 1

        value = 0
        for char in self.charstring[self.first_index:]:
            value = (value * self.BASE + self.AMINO_ACIDS[char]) % self.MOD
        self.hash_value = value
        return removed

    def getString(self):
        """Return the characters currently in the window as a string."""
        # Skip the characters that pop_front has already removed.
        return ''.join(self.charstring[self.first_index:])

    def getHash(self):
        """Return the current hash value."""
        return self.hash_value

    def getSize(self):
        """Return the number of characters currently in the window."""
        return self.size
class PeptideSequence(HashString):
    """
    Convenience wrapper around HashString for looking up peptides.

    When querying the clean dataframes, pass the result of ``loc()`` to the
    dataframe's own ``loc`` indexer:

        peptide_sequence = PeptideSequence('GATCA')
        info = clean_df.loc[peptide_sequence.loc(),:]
    """

    def __init__(self, sequence):
        """
        Hash the given peptide sequence.

        Parameters
        ----------
        sequence : string
            Peptide sequence to be hashed.

        Returns
        -------
        None.
        """
        super(PeptideSequence, self).__init__(sequence)

    def loc(self):
        """
        Return the lookup key for this peptide.

        Returns
        -------
        tuple
            ``(size, hash)`` as reported by ``getSize()`` and ``getHash()``.
        """
        length = self.getSize()
        digest = self.getHash()
        return (length, digest)
-
-
def import_clean_data(path):
    """
    Read a CSV file into a DataFrame indexed by its first two columns.

    The first row of the file is used as the header, and the first two
    columns together form the row index (presumably the peptide size and
    hash -- confirm against the files being loaded).

    Parameters
    ----------
    path : str or file-like
        Location of the CSV file; forwarded unchanged to ``read_csv``.

    Returns
    -------
    DataFrame
        The parsed table.
    """
    index_columns = [0, 1]
    frame = read_csv(path, header=0, index_col=index_columns)
    return frame
|