graph_manager.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. # import sys
  2. # sys.path.append('..')
  3. # from util.mlflow_util import load_uri, get_prev_run
  4. from .mlflow_util import load_uri, get_prev_run
  5. import numpy as np
  6. import scipy.sparse as sps
  7. import scipy.linalg as sla
  8. import os
  9. import mlflow
  10. from sklearn.neighbors import NearestNeighbors
  11. METRICS = ['euclidean', 'cosine']
  12. class GraphManager(object):
  13. """
  14. Mostly Graph methods
  15. Necessary Fields:
  16. self.param
  17. Optional Fields:
  18. """
  19. def __init__(self):
  20. return
  21. def from_features(self, X, knn=15, sigma=3., normalized=True, n_eigs=None, zp_k=None, metric='euclidean', debug=False):
  22. """
  23. load from features using params
  24. params:
  25. knn
  26. sigma
  27. normalized
  28. n_eigs
  29. zp_k
  30. """
  31. assert metric in METRICS
  32. print("Computing (or retrieving) graph evals and evecs with parameters:")
  33. print("\tN = {}, knn = {}, sigma = {}".format(X.shape[0], knn, sigma))
  34. print("\tnormalized = {}, n_eigs = {}, zp_k = {}".format(normalized, n_eigs, zp_k))
  35. print("\tmetric = {}".format(metric))
  36. print()
  37. if not debug:
  38. params = {
  39. 'knn' : knn,
  40. 'sigma' : sigma,
  41. 'normalized' : normalized,
  42. 'n_eigs' : n_eigs,
  43. 'zp_k' : zp_k,
  44. 'metric' : metric
  45. }
  46. prev_run = get_prev_run('GraphManager.from_features',
  47. params,
  48. tags={"X":str(X), "N":str(X.shape[0])},
  49. git_commit=None)
  50. if prev_run is not None:
  51. print('Found previous eigs')
  52. eigs = load_uri(os.path.join(prev_run.info.artifact_uri,
  53. 'eigs.npz'))
  54. return eigs['w'], eigs['v']
  55. print('Did not find previous eigs, computing from scratch...')
  56. W = self.compute_similarity_graph(X=X, knn=knn, sigma=sigma,
  57. zp_k=zp_k, metric=metric)
  58. L = self.compute_laplacian(W, normalized=normalized)
  59. w, v = self.compute_spectrum(L, n_eigs=n_eigs)
  60. L = sps.csr_matrix(L)
  61. self.W = W
  62. if debug:
  63. return w, v
  64. with mlflow.start_run(nested=True):
  65. np.savez('./tmp/eigs.npz', w=w, v=v)
  66. sps.save_npz('./tmp/W.npz', W)
  67. mlflow.set_tag('function', 'GraphManager.from_features')
  68. mlflow.set_tag('X', str(X))
  69. mlflow.set_tag('N', str(X.shape[0]))
  70. mlflow.log_params(params)
  71. mlflow.log_artifact('./tmp/eigs.npz')
  72. mlflow.log_artifact('./tmp/W.npz')
  73. os.remove('./tmp/eigs.npz')
  74. os.remove('./tmp/W.npz')
  75. return w, v
  76. def compute_similarity_graph(self, X, knn=15, sigma=3., zp_k=None, metric='euclidean', maxN=5000):
  77. """
  78. Computes similarity graph using parameters specified in self.param
  79. """
  80. N = X.shape[0]
  81. if knn is None:
  82. if N < maxN:
  83. knn = N
  84. else:
  85. print("Parameter knn was given None and N > maxN, so setting knn=15")
  86. knn = 15
  87. if N < maxN:
  88. print("Calculating NN graph with SKLEARN NearestNeighbors...")
  89. if knn > N / 2:
  90. nn = NearestNeighbors(n_neighbors=knn, algorithm='brute').fit(X)
  91. else:
  92. nn = NearestNeighbors(n_neighbors=knn, algorithm='ball_tree').fit(X)
  93. # construct CSR matrix representation of the k-NN graph
  94. A_data, A_ind = nn.kneighbors(X, knn, return_distance=True)
  95. else:
  96. print("Calculating NN graph with NNDescent package since N = {} > {}".format(N, maxN))
  97. from pynndescent import NNDescent
  98. index = NNDescent(X, metric=metric)
  99. index.prepare()
  100. A_ind, A_data = index.query(X, k=knn)
  101. # modify from the kneighbors_graph function from sklearn to
  102. # accomodate Zelnik-Perona scaling
  103. n_nonzero = N * knn
  104. A_indptr = np.arange(0, n_nonzero + 1, knn)
  105. if zp_k is not None and metric == 'euclidean':
  106. k_dist = A_data[:,zp_k][:,np.newaxis]
  107. k_dist[k_dist < 1e-4] = 1e-4
  108. A_data /= np.sqrt(k_dist * k_dist[A_ind,0])
  109. A_data = np.ravel(A_data)
  110. if metric == 'cosine':
  111. print(np.max(A_data))
  112. W = sps.csr_matrix(((1.-A_data), # need to do 1.-A_data since NNDescent returns cosine DISTANCE (1. - cosine_similarity)
  113. A_ind.ravel(),
  114. A_indptr),
  115. shape=(N, N))
  116. else:
  117. W = sps.csr_matrix((np.exp(-(A_data ** 2)/sigma),
  118. A_ind.ravel(),
  119. A_indptr),
  120. shape=(N, N))
  121. W = (W + W.T)/2
  122. #W = max(W, W.T)
  123. W.setdiag(0)
  124. W.eliminate_zeros()
  125. return W
  126. def compute_laplacian(self, W, normalized=True):
  127. """
  128. Computes the graph Laplacian using parameters specified in self.params
  129. """
  130. if normalized:
  131. L = sps.csgraph.laplacian(W, normed=True)
  132. else:
  133. L = sps.csgraph.laplacian(W, normed=False)
  134. return L
  135. def compute_spectrum(self, L, n_eigs):
  136. """
  137. Computes first n_eigs smallest eigenvalues and eigenvectors
  138. """
  139. N = L.shape[0]
  140. if n_eigs is None:
  141. n_eigs = N
  142. if n_eigs > int(N/2):
  143. w, v = sla.eigh(L.toarray(), eigvals=(0,n_eigs-1))
  144. else:
  145. w, v = sps.linalg.eigsh(L, k=n_eigs, which='SM')
  146. return w, v