evaluate.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. """
  2. evaluate.py
  3. """
  4. import argparse
  5. import os
  6. import time
  7. from ds4ml.evaluator import BiFrame
  8. from ds4ml.utils import read_data_from_csv, CustomFormatter, str_to_list
  9. def main():
  10. parser = argparse.ArgumentParser(
  11. description='Evaluate the utility of synthesized dataset compared with '
  12. 'the source dataset.',
  13. formatter_class=CustomFormatter,
  14. add_help=False)
  15. # positional arguments
  16. parser.add_argument('source',
  17. help='set file path of source (raw) dataset to be '
  18. 'compared with synthesized dataset, only support '
  19. 'CSV files')
  20. parser.add_argument('target',
  21. help='set file path of target (synthesized) dataset to '
  22. 'evaluate')
  23. # optional arguments
  24. group = parser.add_argument_group('general arguments')
  25. group.add_argument("-h", "--help", action="help",
  26. help="show this help message and exit")
  27. group.add_argument('--na-values', metavar='LIST',
  28. help='set additional values to recognize as NA/NaN; ('
  29. 'default null values are from pandas.read_csv)')
  30. group.add_argument('-o', '--output', metavar='FILE', default='report.html',
  31. help='set output path for evaluation report; (default '
  32. 'is "report.html" under current work directory)')
  33. group = parser.add_argument_group('advanced arguments')
  34. group.add_argument('--category', metavar='LIST',
  35. help='set categorical columns separated by a comma.')
  36. group.add_argument('-t', '--test',
  37. help='set test dataset for classification or regression '
  38. 'task; (default take 20%% from source dataset)')
  39. group.add_argument('--class-label', metavar='LIST',
  40. help='set column name as class label for classification '
  41. 'or regression task; supports one or multiple '
  42. 'columns (separated by comma)')
  43. args = parser.parse_args()
  44. start = time.time()
  45. na_values = str_to_list(args.na_values)
  46. class_labels = str_to_list(args.class_label)
  47. categories = str_to_list(args.category)
  48. # check kinds of parameters
  49. args.output = os.path.join(os.getcwd(), args.output)
  50. # if output folder not exists, then create it.
  51. if not os.path.exists(os.path.dirname(args.output)):
  52. os.makedirs(os.path.dirname(args.output))
  53. def complement(attrs, full):
  54. return set(attrs or []) - set(full)
  55. # Initialization task:
  56. source = read_data_from_csv(args.source, na_values=na_values, header='infer')
  57. target = read_data_from_csv(args.target, na_values=na_values, header='infer')
  58. test = read_data_from_csv(args.test) if args.test is not None else None
  59. comp = complement(class_labels, source.columns)
  60. if comp:
  61. parser.exit(message=f'--class-label(s): {comp} are not in source file.')
  62. comp = complement(class_labels, target.columns)
  63. if comp:
  64. parser.exit(message=f'--class-label(s): {comp} are not in target file.')
  65. frame = BiFrame(source, target, categories=categories)
  66. frame.to_html(buffer=args.output, title='Data Utility Evaluation Report',
  67. labels=class_labels, test=test)
  68. duration = time.time() - start
  69. print(f'Evaluate dataset {args.source} and {args.target} and generate '
  70. f'report at {args.output} in {round(duration, 2)} seconds.')
  71. if __name__ == '__main__':
  72. # For Testing
  73. main()