"""
compactify.py

Shortens the length of an audio file by removing silence and increasing the speed.

Note: Uses the pydub library, which loads the audio into memory (only small files have been tested).

Other methods can certainly improve upon this naive attempt, especially since I have little
experience in audio signal processing. However, it works for my intended purposes. You might need
to play with the threshold value or window length.

The current method is to apply A-weighting to the signal to help identify silence better.
We then use the Hilbert transform on the filtered samples to get an envelope, which we compare
against a threshold value. More information here:
https://www.mathworks.com/help/dsp/examples/envelope-detection.html

Brandon Sachtleben

TODO:
    * Handle more diverse cases such as background noise or multiple sources of noise.
    * Improve performance.
    * Rewrite without pydub if possible (I had some issues with reading using scipy.io.wavfile)
"""
  18. import os
  19. import sys
  20. if len(sys.argv) not in [2, 3]:
  21. print("Usage: python compactify.py [audio filename] [threshold value (optional)]")
  22. sys.exit(1)
  23. if not os.path.isfile(sys.argv[1]):
  24. raise Exception("Cannot find file")
  25. from scipy.signal import filtfilt
  26. from scipy.signal import bilinear
  27. from scipy.signal import hilbert
  28. import numpy as np
  29. from numpy import pi, polymul
  30. from pydub import AudioSegment
  31. import matplotlib.pyplot as plt
  32. # Progress bar
  33. from tqdm import tqdm
  34. def A_weight(fs):
  35. """
  36. Coefficients and formula based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4331191/
  37. """
  38. o = 2*pi*np.array([20.598997, 107.65265, 737.86223, 12194.217])
  39. G = -2.0
  40. num = [G*o[3]**2, 0, 0, 0, 0]
  41. denom = polymul(polymul(polymul([1, o[0]], [1, o[0]]), polymul([1, o[3]], [1, o[3]])),
  42. polymul([1, o[1]], [1, o[2]]))
  43. return bilinear(num, denom, fs)
  44. def plot_signal(signal, fs):
  45. plt.plot(np.linspace(0, len(signal)/fs, len(signal)), signal)
  46. class Audio:
  47. def __init__(self, filename):
  48. # Load the audio file
  49. self.audio = AudioSegment.from_file(filename)
  50. # Get the sample rate and numpy array of the sound data
  51. self.fs = self.audio.frame_rate
  52. self.types = [np.uint8, np.int16, np.int32, np.int32]
  53. x = np.fromstring(self.audio._data, self.types[self.audio.sample_width - 1])
  54. temp = []
  55. for ch in list(range(self.audio.channels)):
  56. temp.append(x[ch::self.audio.channels])
  57. self.data = np.array(temp).T
  58. self.data = self.data.flatten()
  59. # Parameters
  60. self.window_length = 100
  61. self.threshold = int(sys.argv[2]) if len(sys.argv) == 3 else 10000000
  62. def remove_silence(self, plot = False):
  63. # Progress bar
  64. pbar_step = len(self.data)
  65. pbar_total = 5*pbar_step
  66. pbar = tqdm(total = pbar_total)
  67. # Plot 1 - unmodified original audio
  68. plt.subplot(3, 1, 1)
  69. plot_signal(self.data, self.fs)
  70. plt.title("Original audio")
  71. # Apply A-weighting first
  72. b, a = A_weight(self.fs)
  73. y = filtfilt(b, a, self.data)
  74. pbar.update(pbar_step)
  75. # Plot 2 - A-weighting applied to samples
  76. plt.subplot(3, 1, 2)
  77. plot_signal(y, self.fs)
  78. plt.title("A-weighted")
  79. # Get an envelope
  80. analytic_signal = hilbert(y)
  81. y_env = np.abs(analytic_signal)
  82. pbar.update(pbar_step)
  83. # Plot 3 - envelope
  84. plt.subplot(3, 1, 3)
  85. plot_signal(y_env, self.fs)
  86. plt.title("Envelope")
  87. if plot:
  88. plt.show()
  89. plt.savefig("{0:s}_processed.png".format(sys.argv[1][0:-4]))
  90. plt.close()
  91. segments = []
  92. # Get non-silent segments
  93. for i in range(0, len(y_env), self.window_length):
  94. Y = y_env[i:i+self.window_length+1]
  95. mean = Y.mean()*(1 + int((i-self.window_length) in segments)*0.5)
  96. if mean > self.threshold:
  97. segments.append(i)
  98. pbar.update(self.window_length * int(i > 0))
  99. pbar.update(len(y_env) - self.window_length * np.floor(len(y_env)/self.window_length))
  100. # Plot for showing regions detected that have audio above threshold value
  101. fig, ax = plt.subplots()
  102. fig.set_size_inches(15, 6, forward=True)
  103. ax.plot(np.linspace(0, len(self.data)/self.fs, len(self.data)), self.data)
  104. start_seg = segments[0]
  105. is_start_seg = True
  106. # Plot regions of audio above threshold (There is certainly a more elegant way to do this.)
  107. for i in range(0, len(segments)):
  108. if (i < len(segments)-1):
  109. # marks the end of a segment
  110. if is_start_seg and (segments[i+1]/self.fs-segments[i]/self.fs) > 0.13:
  111. plt.axvspan(start_seg/self.fs, segments[i]/self.fs, facecolor='g', alpha=0.5)
  112. is_start_seg = False
  113. # marks the start of a segment
  114. elif not is_start_seg and (segments[i+1]/self.fs-segments[i]/self.fs) <= 0.1:
  115. start_seg = segments[i]
  116. is_start_seg = True
  117. else:
  118. if is_start_seg:
  119. plt.axvspan(start_seg/self.fs, segments[i]/self.fs, facecolor='g', alpha=0.5)
  120. is_start_seg = False
  121. pbar.update(pbar_step)
  122. plt.title("Detected silence")
  123. if plot:
  124. plt.show()
  125. plt.savefig("{0:s}_segments.png".format(sys.argv[1][0:-4]))
  126. plt.close()
  127. # Splice data segments
  128. out = np.array([], dtype=self.types[self.audio.sample_width - 1])
  129. for i in segments:
  130. out = np.append(out, self.data[i:i+self.window_length])
  131. # Final plot showing truncated output
  132. plot_signal(out, self.fs)
  133. plt.title("Truncated audio")
  134. plt.savefig("{0:s}_trunc.png".format(sys.argv[1][0:-4]))
  135. pbar.update(pbar_total - pbar.n)
  136. pbar.close()
  137. return out
  138. def export(self, filename, data):
  139. self.audio._data = data
  140. self.audio.export(filename, format='wav')
  141. audio = Audio(sys.argv[1])
  142. data = audio.remove_silence(plot = True)
  143. strOut = "{0:s}_cut.wav".format(sys.argv[1][0:-4])
  144. audio.export(strOut, data)