utils.py

import random
import numpy as np
import os
import feather
import hashlib
import re
import time
import heapq
import pickle
import dask.dataframe as dd


def read_data(data_folder, filename, nrows, text_column, config):
    """
    Reads the dataframe with all data instances.
    :param data_folder: str, name of the folder containing the source data
    :param filename: str, name of the file containing the source data
    :param nrows: int, number of data instances sampled for analysis
    :param text_column: str, name of the column containing text instances in the data file
    :param config: an instance of the ExperimentConfig class
    :return: an instance of the ExperimentConfig class with the prepared data frame attached
    """
    # define the file path
    filepath = f'./{data_folder}/{filename}'
    # read the data file
    df = feather.read_dataframe(filepath)
    # use only the defined part of the data set
    if nrows >= len(df):
        config.nrows = len(df)
    df = df.sample(n=config.nrows, random_state=17)

    # compute the check sum of each original text instance and add it to the data frame
    def compute_check_sum(x):
        return hashlib.md5(x.encode("utf-8")).hexdigest()

    df['check_sum'] = df[text_column].map(compute_check_sum)
    df = df.set_index('check_sum', drop=False)
    # store the data frame
    columns_to_return = [text_column, "check_sum"]
    config.df = df[columns_to_return]
    return config
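
# A hedged usage sketch (not part of the original file): read_data expects a feather file on disk,
# so the folder, file and column names below are assumptions, shown only to illustrate the call.
# config = read_data(data_folder="data", filename="dataset.feather", nrows=1000,
#                    text_column="text", config=config)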


def return_selected_window_of_tokens(config):
    """
    A function which enables pre-selection of a part of the original text instance.
    It implements the naive 'keep beginning tokens' method and the Text Guide method.
    :param config: an instance of config.ExperimentConfig(). It includes the original data set.
    :return: config: an instance of config.ExperimentConfig(). It includes the modified data set.
    """
    def create_pseudo_instance(token_list, full_tokenized_instance, desired_length, selected_words_list,
                               one_side_neighbours, number_of_important_token_occurrences):
        x = full_tokenized_instance
        # create a dictionary of position indexes of each important token
        idx_dict = dict()
        for token in selected_words_list:
            index_list = [i for i, tok in enumerate(x) if tok == token]
            if len(index_list) != 0:
                idx_dict[token] = index_list
        # create pseudo sentences from each important token and its surrounding tokens
        selected_neighbourhoods = list()
        for token in idx_dict:
            count = 0
            for token_index in idx_dict[token]:
                # guard against a negative slice start when the token sits near the beginning
                start = max(token_index - one_side_neighbours, 0)
                selected_neighbourhoods.append(x[start:(token_index + one_side_neighbours + 1)])
                count += 1
                if number_of_important_token_occurrences <= count:
                    break
        # create the final text instance
        for pseudo_sentence in selected_neighbourhoods:
            token_list.extend(pseudo_sentence)
            # sometimes the resulting instance would be too long, so only the first n pseudo sentences
            # are taken; the order matters, as the first ones come from the more important tokens
            if len(token_list) >= desired_length - (2 * one_side_neighbours + 1):
                break
        # sometimes the resulting text instance does not use the full 510-token limit
        if config.fill_up_to_limit:
            # if so, the new text instance is filled with the first k tokens until the limit is reached,
            # even if this duplicates some sentences
            if len(token_list) < desired_length:
                number_missing_tokens = desired_length - len(token_list)
                token_list.extend(x[:number_missing_tokens])
        return token_list
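
    # For illustration, with hypothetical values (not taken from the original repository):
    # with one_side_neighbours=2 and an important token at position 10, the pseudo sentence is
    # x[8:13], i.e. the token plus two neighbours on each side; pseudo sentences are appended
    # until the desired_length budget (minus one window, 2*one_side_neighbours + 1) is reached.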
    def prepare_for_sentence_feature_importance(config):
        """
        A function which prepares a word list sorted according to the selected method.
        :param config: a config.ExperimentConfig() instance
        :return: list of str (sITFL, i.e., selected words used by the Text Guide method).
        """
        # if the configuration file didn't provide a sorting method, use 'descending'
        if not hasattr(config, 'feature_importance_sort'):
            config.feature_importance_sort = 'descending'
        # read the file with precomputed feature importances
        filepath = os.path.join(config.data_folder, config.feature_importance_file)
        with open(filepath, 'rb') as f:
            feature_importances = pickle.load(f)
        # get the n most important features sorted according to the selected method
        important_features = dict()
        # this eliminates non-token features that were used by the ML classifier
        for key in feature_importances:
            if key.find("f_") == -1:
                important_features[key] = feature_importances[key]
        if config.feature_importance_sort == 'descending':
            # this sorts the features by importance value, largest first
            feature_importances = {k: v for k, v in heapq.nlargest(4000, important_features.items(),
                                                                   key=lambda i: i[1])}
        elif config.feature_importance_sort == 'nsmallest':
            # this sorts the features by importance value, smallest first
            feature_importances = {k: v for k, v in heapq.nsmallest(4000, important_features.items(),
                                                                    key=lambda i: i[1])}
        elif config.feature_importance_sort == 'random':
            # this shuffles the filtered token features randomly
            feature_list = list(important_features.items())
            random.shuffle(feature_list)
            feature_importances = dict(feature_list)
        # return the sITFL, i.e., the sorted list of important feature tokens
        return list(feature_importances.keys())
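
    # For illustration, an assumption about the pickle's layout consistent with the filter above:
    # feature_importances might look like {"excellent": 0.91, "refund": 0.47, "f_17": 0.12, ...};
    # keys containing "f_" are treated as non-token features and are excluded from the sITFL.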
    def text_guide(original_tokenized_instance: list, desired_length: int, selected_words_list: list,
                   one_side_neighbours: int, beg_part: float, end_part: float, over_length: float,
                   number_of_important_token_occurrences: int):
        """
        A function which implements the Text Guide text pre-selection method, useful for long text classification.
        :param original_tokenized_instance: list of str. The tokenized text instance.
        :param desired_length: int. Defines the length of the final text instance in number of tokens.
        :param selected_words_list: list of str. List of tokens used as guides for selecting informative text parts.
        :param one_side_neighbours: int. Number of tokens to be taken as neighbours providing context from one side
            of the selected word.
        :param beg_part: float. int(desired_length * beg_part) defines the number of tokens from the beginning
            of the original text instance that are also used in the final text instance.
        :param end_part: float. int(desired_length * end_part) defines the number of tokens from the end
            of the original text instance that are also used in the final text instance.
        :param over_length: float. Text Guide is applied only to instances satisfying
            instance_length >= int(desired_length * over_length); shorter instances are simply truncated.
        :param number_of_important_token_occurrences: int. For example, if 1, for each token from
            selected_words_list only the first occurrence of that token in the original text instance is used.
            If 2, the first two occurrences, and so on.
        :return: str. Modified text instance.
        """
        x = original_tokenized_instance
        instance_length = len(x)
        if instance_length < int(desired_length * over_length):
            return " ".join(x[:desired_length])
        else:
            # create the final text instance
            n_first_tokens_to_keep = int(desired_length * beg_part)
            first_part = x[:n_first_tokens_to_keep]
            n_last_tokens_to_keep = int(desired_length * end_part)
            if n_last_tokens_to_keep != 0:
                ending_part = x[-n_last_tokens_to_keep:]
                remainder_x = x[n_first_tokens_to_keep:-n_last_tokens_to_keep]
            else:
                ending_part = list()
                remainder_x = x[n_first_tokens_to_keep:]
            first_part.extend(ending_part)
            x = remainder_x
            final_text = create_pseudo_instance(token_list=first_part,
                                                full_tokenized_instance=x,
                                                desired_length=desired_length,
                                                selected_words_list=selected_words_list,
                                                one_side_neighbours=one_side_neighbours,
                                                number_of_important_token_occurrences=number_of_important_token_occurrences)
            return " ".join(final_text)
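
    # Worked example with hypothetical parameter values (assumptions, not defaults of this repository):
    # for desired_length=510, beg_part=0.25, end_part=0.1, over_length=1.0 and a 2000-token instance,
    # 2000 < int(510 * 1.0) is False, so Text Guide runs: it keeps the first int(510 * 0.25) = 127
    # tokens and the last int(510 * 0.1) = 51 tokens, then fills the remaining ~332 positions with
    # pseudo sentences built by create_pseudo_instance() around the selected important tokens.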
    # tokenize all text instances
    tokenized_col_name = 'tokenized'
    config.df[tokenized_col_name] = config.df[config.text_column].map(lambda x: x.split(' '))
    # define the name of the column with new text instances
    new_text_column = "new_text"
    # read the desired token length of new text instances
    desired_length = config.desired_length
    if config.truncation_method == "beginning":
        print("New text instances are created according to the naive 'keep beginning tokens' method.")
        config.df[new_text_column] = config.df[tokenized_col_name].map(lambda x: " ".join(x[:desired_length]))
    elif config.truncation_method == "text_guide":
        print("New text instances are created according to Text Guide.")
        selected_words_list = prepare_for_sentence_feature_importance(config)
        # this method uses dask for improved performance; adapt npartitions to the number of cores
        # available on your machine
        one_side_neighbours = config.one_side_neighbours
        number_of_important_token_occurrences = config.number_of_important_token_occurrences
        ddata = dd.from_pandas(config.df, npartitions=config.cpu_threads)
        beg_part = config.beg_part
        end_part = config.end_part
        over_length = config.over_length
        config.df[new_text_column] = ddata \
            .map_partitions(lambda df: df.apply((lambda row: text_guide(
                original_tokenized_instance=row[tokenized_col_name],
                desired_length=desired_length,
                selected_words_list=selected_words_list,
                one_side_neighbours=one_side_neighbours,
                beg_part=beg_part,
                end_part=end_part,
                over_length=over_length,
                number_of_important_token_occurrences=number_of_important_token_occurrences)), axis=1)) \
            .compute(scheduler='processes')
    config.df[config.text_column] = config.df[new_text_column]
    config.df.drop(tokenized_col_name, axis=1, inplace=True)
    config.df.drop(new_text_column, axis=1, inplace=True)
    return config
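

# A minimal usage sketch, not part of the original module. It assumes an ExperimentConfig-like
# object exposing the attributes referenced above; types.SimpleNamespace stands in for it here,
# and the naive 'keep beginning tokens' method is used so no feature-importance file is required.
if __name__ == "__main__":
    import pandas as pd
    from types import SimpleNamespace

    demo_config = SimpleNamespace(
        text_column="text",                 # hypothetical column name
        desired_length=5,                   # keep only the first 5 tokens
        truncation_method="beginning",      # naive method, avoids the dask/feature-importance path
        df=pd.DataFrame({"text": ["a very long example text instance with many tokens indeed"]}),
    )
    demo_config = return_selected_window_of_tokens(demo_config)
    print(demo_config.df["text"].iloc[0])   # -> 'a very long example text'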