utils_10.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. def extract_codebook(dataset='train'):
  2. f = open(data_path + dataset + '_list.txt', 'r')
  3. i = 0
  4. for file_name in f:
  5. i = i + 1
  6. if not (i % 10):
  7. print(i)
  8. # load audio file
  9. file_name = file_name.rstrip('\n')
  10. file_path = data_path + file_name
  11. # #print file_path
  12. y0, sr = librosa.load(file_path, sr=22050)
  13. # we use first 1 second
  14. half = len(y0) / 4
  15. y = y0[:round(half)]
  16. # STFT
  17. S_full, phase = librosa.magphase(librosa.stft(y, n_fft=1024, window='hann', hop_length=256, win_length=1024))
  18. n = len(y)
  19. # Check the shape of matrix: row must corresponds to the example index !!!
  20. X = S_full.T
  21. # codebook by using K-Means Clustering
  22. K = 20
  23. kmeans = KMeans(n_clusters=K, random_state=0).fit(X)
  24. features_kmeans = np.zeros(X.shape[0])
  25. # for each sample, summarize feature!!!
  26. codebook = np.zeros(K)
  27. for sample in range(X.shape[0]):
  28. features_kmeans[sample] = kmeans.labels_[sample]
  29. # codebook histogram!
  30. unique, counts = np.unique(features_kmeans, return_counts=True)
  31. for u in unique:
  32. u = int(u)
  33. codebook[u] = counts[u]
  34. # save mfcc as a file
  35. file_name = file_name.replace('.wav', '.npy')
  36. save_file = codebook_path + file_name
  37. if not os.path.exists(os.path.dirname(save_file)):
  38. os.makedirs(os.path.dirname(save_file))
  39. np.save(save_file, codebook)
  40. f.close()