# test_dataset.py — tests for ds4ml.dataset.DataSet (encode / synthesize)
  1. from pandas import DataFrame
  2. from numpy import array_equal
  3. from ds4ml.dataset import DataSet
  4. from .testdata import adults01
  5. def test_encode():
  6. from .testdata import adults01
  7. from numpy import array_equal
  8. dataset = DataSet(adults01)
  9. frame = dataset.encode()
  10. for col in ['education', 'relationship', 'salary']:
  11. assert col not in frame.columns
  12. for col in ['age', 'birth']:
  13. assert col in frame.columns
  14. assert 'salary_<=50K' in frame.columns
  15. assert 'salary_>50K' in frame.columns
  16. for attr, val in [('salary', '<=50K'),
  17. ('relationship', 'Wife'),
  18. ('relationship', 'Husband')]:
  19. trans_col = frame[f'{attr}_{val}'].apply(lambda v: v == 1)
  20. origin_col = adults01[attr] == val
  21. assert array_equal(trans_col, origin_col)
  22. def test_encode_partly():
  23. from .testdata import adults01
  24. from sklearn.model_selection import train_test_split
  25. dataset = DataSet(adults01)
  26. train, test = train_test_split(adults01, test_size=0.2)
  27. frame = dataset.encode(data=train)
  28. assert 'salary_<=50K' in frame.columns
  29. assert 'salary_>50K' in frame.columns
  30. assert ((0 == frame['salary_<=50K']) | (frame['salary_<=50K'] == 1)).all()
  31. assert ((0.0 <= frame['age']) & (frame['age'] <= 1.0)).all()
  32. def test_encode_empty_column():
  33. from numpy import array_equal
  34. data = [[1001, 'A', 'Female'],
  35. [1002, 'B', 'Male'],
  36. [1003, 'C', 'Male'],
  37. [1004, 'D', 'Female'],
  38. [1005, 'E', 'Female']]
  39. ds = DataSet(data, columns=['ID', 'Name', 'Sex'])
  40. x = DataFrame(data[-2:], columns=['ID', 'Name', 'Sex'])
  41. x_tf = ds.encode(data=x)
  42. # Name is not categorical, because it has unique values
  43. assert x_tf.shape == (2, 3)
  44. assert array_equal(x_tf.columns, ['ID', 'Sex_Female', 'Sex_Male'])
  45. def test_svm_task():
  46. from sklearn.svm import SVC
  47. from sklearn.model_selection import train_test_split
  48. from .testdata import adults01
  49. c_df = DataFrame(adults01)
  50. c_tf = DataSet(c_df).encode()
  51. train, test = train_test_split(c_tf, test_size=0.2)
  52. def make_train_x_y(df):
  53. x_ = df.drop(['salary_<=50K', 'salary_>50K'], axis=1)
  54. # <=50K and >50K are binary, complementary
  55. _, ym_ = df['salary_<=50K'], df['salary_>50K']
  56. return x_, ym_
  57. tr_x, tr_y = make_train_x_y(train)
  58. te_x, te_y = make_train_x_y(test)
  59. clf = SVC(gamma='scale')
  60. clf.fit(tr_x, tr_y)
  61. pr_y = clf.predict(te_x)
  62. from sklearn.metrics import confusion_matrix, classification_report
  63. print(confusion_matrix(te_y, pr_y))
  64. print(classification_report(te_y, pr_y))
  65. def test_synthesize():
  66. dataset = DataSet(adults01)
  67. df = dataset.synthesize()
  68. assert df.size == dataset.size
  69. def test_synthesize_with_pseudonyms():
  70. dataset = DataSet(adults01)
  71. df = dataset.synthesize(pseudonyms=['salary'])
  72. assert df.size == dataset.size
  73. assert array_equal(dataset['salary'].value_counts().values,
  74. df['salary'].value_counts().values)
  75. def test_synthesize_with_retains():
  76. dataset = DataSet(adults01)
  77. df = dataset.synthesize(retains=['age'])
  78. assert df.size == dataset.size
  79. assert array_equal(dataset['age'], df['age'])
  80. def test_synthesize_for_privacy():
  81. # Verify probability after synthesis by differential privacy. (This test
  82. # case may fail because of limit runs.)
  83. from numpy.random import randint
  84. from numpy import exp
  85. epsilon = 0.1
  86. runs = 200
  87. data = randint(65, 90, size=(199, 2))
  88. set1 = DataSet(data.tolist() + [[65, 65]], columns=['ColA', 'ColB'])
  89. set2 = DataSet(data.tolist() + [[65, 66]], columns=['ColA', 'ColB'])
  90. counts = [0, 0]
  91. for i in range(runs):
  92. df1 = set1.synthesize(epsilon=epsilon)
  93. df2 = set2.synthesize(epsilon=epsilon)
  94. counts[0] += ((df1['ColA'] == 65) & (df1['ColB'] == 65)).sum()
  95. counts[1] += ((df2['ColA'] == 65) & (df2['ColB'] == 66)).sum()
  96. assert counts[0] / (runs * 200) <= exp(epsilon) * counts[1] / (runs * 200)