LiuFan
/
PrivacyScanData


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
							from datetime import datetime
from collections import OrderedDict
import pathlib
"""
Configuration file for Text Guide.
How to use:
Define the folder containing the data file, the filename of the data file, and the name of the column in the data
file that contains the unstructured textual data. Select the truncation method and the name of the file (dictionary)
containing "key: value" pairs, where key == important token and value == its feature importance. Select the sorting
method for the important token dictionary.
"""


class ExperimentConfig(OrderedDict):
    """Settings container for a Text Guide run.

    Every setting is stored as a plain instance attribute assigned in
    ``__init__``; no dictionary keys are populated by this class itself.
    """

    def __init__(self) -> None:
        super().__init__()

        # --- bookkeeping -----------------------------------------------------
        # Creation timestamp (naive datetime obtained from utcnow()).
        self.start_time = datetime.utcnow()
        # Name of this configuration == file name of this module without suffix.
        self.config_name = pathlib.Path(__file__).stem
        # Placeholder; nothing is loaded here.
        self.df = None

        # --- files and names -------------------------------------------------
        self.data_folder = "data"
        self.filename = "dmoz_100_instances.ftr"
        self.text_column = "Text"
        # Number of instances to convert. The whole file dmoz_100_instances.ftr
        # has 100 instances.
        self.nrows = 7
        # Either 'text_guide' or 'beginning'.
        self.truncation_method = "text_guide"
        # The attached file was obtained from a BoW model and a gradient
        # boosting classifier. For different data, a different file is needed.
        self.feature_importance_file = "dmoz_30_1500_sITFL.p"
        self.feature_importance_sort = "descending"

        # --- other Text Guide parameters -------------------------------------
        # Desired length of the new text instance.
        self.desired_length = 300
        # Number of tokens surrounding the important token that are used for
        # creating the pseudo sentence.
        self.one_side_neighbours = 3
        # Part of the beginning of the original text instance used by
        # Text Guide to create the new text instance.
        self.beg_part = 0.1
        # Part of the ending of the original text instance used by Text Guide
        # to create the new text instance.
        self.end_part = 0.2
        # If set to 1, Text Guide will be used for all text instances.
        # Text Guide will be used only for instances with length greater than
        # desired_length*over_length tokens.
        self.over_length = 1
        # Number of important token occurrences used by Text Guide to create
        # pseudo sentences.
        self.number_of_important_token_occurrences = 1
        # Some text instances created by Text Guide will be shorter than
        # desired_length. Set to True to fill the remaining space with the
        # initial tokens of the original text instance.
        self.fill_up_to_limit = True

        # Number of cpu threads to be used by Dask when computing Text Guide
        # instances.
        self.cpu_threads = 14