text_guide_dmoz_example.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
import pathlib
from collections import OrderedDict
from datetime import datetime, timezone
"""
Configuration file for Text Guide.
How to use:
Define the folder containing the data file, the filename of the data file, and the name of the column in the data
file that holds the unstructured textual data. Select the truncation method and the name of the file (dictionary)
containing "key: value" pairs, where key == important token and value == its feature importance. Select the sorting
method for the important-token dictionary.
"""
  12. class ExperimentConfig(OrderedDict):
  13. def __init__(self):
  14. super().__init__()
  15. self.start_time = datetime.utcnow()
  16. self.config_name = f"{pathlib.Path(__file__).stem}"
  17. self.df = None
  18. # specify files and names
  19. self.data_folder = 'data'
  20. self.filename = "dmoz_100_instances.ftr"
  21. self.text_column = "Text"
  22. self.nrows = 7 # the number of instances to convert. The Whole file dmoz_100_instances.ftr has 100 instances.
  23. self.truncation_method = 'text_guide' # or 'beginning'
  24. self.feature_importance_file = f"dmoz_30_1500_sITFL.p" # the attached file was obtained from a BoW model
  25. # and a gradient boosting classifier. For different data, different file is needed.
  26. self.feature_importance_sort = 'descending'
  27. # specify other Text Guide parameters
  28. self.desired_length = 300 # the desired length of the new text instance.
  29. self.one_side_neighbours = 3 # number of tokens surrounding the important token to be used for creating the
  30. # pseudo sentence
  31. self.beg_part = 0.1 # the part of the beginning of the original text instance to be used by Text Guide to
  32. # create the new text instance
  33. self.end_part = 0.2 # the part of the ending of the original text instance to be used by Text Guide to
  34. # create the new text instance
  35. self.over_length = 1 # if set to 1, Text Guide will be used for all tex instances. Text Guide will be used
  36. # only for instances with length greater than desired_length*over_length tokens.
  37. self.number_of_important_token_occurrences = 1 # number of important token occurrences to be used by
  38. # Text Guide to create pseudo sentences.
  39. self.fill_up_to_limit = True # some text instances created by Text Guide will be shorter than the
  40. # desired_length. Set to "True" if you wish to fill the remaining space by initial tokens of the original text
  41. # instance
  42. self.cpu_threads = 14 # number of cpu threads to be used by Dask when computing Text Guide instances