"""
pattern.py - command-line tool that reads a CSV dataset and serializes its
patterns anonymously (with differential privacy) to a JSON file.
"""
  4. from ds4ml.dataset import DataSet
  5. from ds4ml.utils import CustomFormatter, read_data_from_csv, file_name, str_to_list
  6. import argparse
  7. import time
  8. def main():
  9. parser = argparse.ArgumentParser(
  10. description='Serialize patterns of a dataset anonymously',
  11. formatter_class=CustomFormatter,
  12. add_help=False)
  13. parser.add_argument('file', help='set path of a csv file to be patterned '
  14. 'anonymously')
  15. # optional arguments
  16. group = parser.add_argument_group('general arguments')
  17. group.add_argument("-h", "--help", action="help",
  18. help="show this help message and exit")
  19. group.add_argument('--pseudonym', metavar='LIST',
  20. help='set candidate columns separated by a comma, which '
  21. 'will be replaced with a pseudonym. It only works '
  22. 'on the string column.')
  23. group.add_argument('--delete', metavar='LIST',
  24. help='set columns separated by a comma, which will be '
  25. 'deleted when synthesis.')
  26. group.add_argument('--na-values', metavar='LIST',
  27. help='set additional values to recognize as NA/NaN; '
  28. '(default null values are from pandas.read_csv)')
  29. group.add_argument('-o', '--output', metavar='FILE',
  30. help="set the file name of anonymous patterns (default "
  31. "is input file name with a suffix '-pattern.json')")
  32. group.add_argument('--no-header', action='store_true',
  33. help='indicate there is no header in a CSV file, and '
  34. 'will take [#0, #1, #2, ...] as header. (default: '
  35. 'the tool will try to detect and take actions)')
  36. group.add_argument('--sep', metavar='STRING',
  37. help='specify the delimiter of the input file')
  38. group = parser.add_argument_group('advanced arguments')
  39. group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
  40. help='set epsilon for differential privacy (default 0.1)',
  41. default=0.1)
  42. group.add_argument('--category', metavar='LIST',
  43. help='set categorical columns separated by a comma.')
  44. args = parser.parse_args()
  45. start = time.time()
  46. pseudonyms = str_to_list(args.pseudonym)
  47. deletes = str_to_list(args.delete)
  48. categories = str_to_list(args.category)
  49. na_values = str_to_list(args.na_values)
  50. header = None if args.no_header else 'infer'
  51. sep = ',' if args.sep is None else args.sep
  52. data = read_data_from_csv(args.file, na_values=na_values, header=header,
  53. sep=sep)
  54. def complement(attrs, full):
  55. return set(attrs or []) - set(full)
  56. # check parameters: pseudonyms, deletes, categories
  57. comp = complement(pseudonyms, data.columns)
  58. if comp:
  59. parser.exit(message=f'--pseudonym columns: {comp} are not in csv file.')
  60. comp = complement(deletes, data.columns)
  61. if comp:
  62. parser.exit(message=f'--delete columns: {comp} are not in csv file.')
  63. comp = complement(categories, data.columns)
  64. if comp:
  65. parser.exit(message=f'--category columns: {comp} are not in csv file.')
  66. dataset = DataSet(data, categories=categories)
  67. if args.output is None:
  68. name = file_name(args.file)
  69. args.output = f'{name}-pattern.json'
  70. dataset.to_pattern(path=args.output, epsilon=args.epsilon, deletes=deletes,
  71. pseudonyms=pseudonyms, retains=[])
  72. duration = time.time() - start
  73. print(f'Analyze and serialize the patterns of {args.file} at {args.output} '
  74. f'in {round(duration, 2)} seconds.')
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == '__main__':
    main()