# metrics.py
  1. """
  2. Metrics for data synthesis
  3. """
  4. import pandas as pd
  5. import numpy as np
  6. from sklearn.metrics import normalized_mutual_info_score, confusion_matrix
  7. def pairwise_mutual_information(frame: pd.DataFrame):
  8. """
  9. Return mutual information matrix for pairwise columns of a DataFrame.
  10. """
  11. columns = frame.columns.sort_values().to_list()
  12. mi = pd.DataFrame(columns=columns, index=columns, dtype=float)
  13. for row in columns:
  14. for col in columns:
  15. if pd.isnull(mi.at[col, row]):
  16. mi.at[row, col] = normalized_mutual_info_score(frame[row],
  17. frame[col])
  18. else:
  19. mi.at[row, col] = mi.at[col, row]
  20. return mi.round(3)
  21. def jensen_shannon_divergence(p, q, base=2):
  22. """
  23. Return the Jensen-Shannon divergence between two 1-D arrays.
  24. Parameters
  25. ---------
  26. p : array
  27. left probability array
  28. q : array
  29. right probability array
  30. base : numeric, default 2
  31. logarithm base
  32. Returns
  33. -------
  34. jsd : float
  35. divergence of p and q
  36. """
  37. # If the sum of probability array p or q does not equal to 1, then normalize
  38. p = np.asarray(p)
  39. q = np.asarray(q)
  40. p = p / np.sum(p, axis=0)
  41. q = q / np.sum(q, axis=0)
  42. from scipy.spatial.distance import jensenshannon
  43. return round(jensenshannon(p, q, base=base), 4)
  44. def error_rate(y_true, y_pred=None):
  45. """
  46. Return error (mis-classification) rate of one classifier result
  47. If there is only one parameter, it must be the confusion matrix;
  48. If there are two parameters, they must be true and predict labels;
  49. """
  50. if y_pred is None:
  51. if isinstance(y_true, pd.DataFrame):
  52. cm = y_true.values
  53. else:
  54. cm = y_true
  55. else:
  56. cm = confusion_matrix(y_true, y_pred)
  57. trace = np.trace(cm)
  58. sum_ = np.sum(cm)
  59. return round((sum_ - trace) / sum_, 4)
  60. def relative_error(x, y):
  61. """
  62. Return relative error of two variables: |x-y|/max(|x|, |y|)
  63. """
  64. m = np.maximum(np.abs(x), np.abs(y))
  65. return round(np.average(np.abs(x - y) / (m + 1e-6)), 4)