LiuFan
/
PrivacyScanData


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
							"""

compactify.py


Shortens the length of an audio file by removing silence and increasing the speed.


Note: Uses the pydub library, which loads the whole file into memory (only small files have been tested).


Other methods certainly can improve upon this naive attempt, especially since I have little

experience in audio signal processing. However, it works for my intended purposes. You might need

to play with the threshold value or window length.


The current method is to apply A-weighting to the signal to help identify silence better.

We then use the Hilbert transform on the filtered samples to get an envelope which we compare

against a threshold value. More information here:

https://www.mathworks.com/help/dsp/examples/envelope-detection.html


Brandon Sachtleben


TODO:

* Handle more diverse cases such as background noise or multiple sources of noise.

* Improve performance.

* Rewrite without pydub if possible (I had some issues with reading using scipy.io.wavfile)

"""

import os
import sys

# Validate the command line up front: exactly one filename, plus an optional
# threshold value, must follow the script name.
if len(sys.argv) < 2 or len(sys.argv) > 3:
    print("Usage: python compactify.py [audio filename] [threshold value (optional)]")
    sys.exit(1)

# The first argument has to name an existing regular file.
_input_path = sys.argv[1]
if not os.path.isfile(_input_path):
    raise Exception("Cannot find file")

from scipy.signal import filtfilt
from scipy.signal import bilinear
from scipy.signal import hilbert
import numpy as np
from numpy import pi, polymul
from pydub import AudioSegment
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm

def A_weight(fs):
    """
    Build the digital A-weighting filter for sample rate ``fs``.

    Coefficients and formula based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4331191/

    The analog transfer function is assembled from four corner frequencies
    and then mapped to the digital domain with ``scipy.signal.bilinear``.

    Returns the ``(b, a)`` coefficient pair produced by ``bilinear``.
    """
    corner_hz = np.array([20.598997, 107.65265, 737.86223, 12194.217])
    omega = 2*pi*corner_hz
    gain = -2.0

    # Analog numerator: gain * omega[3]^2 * s^4
    numerator = [gain*omega[3]**2, 0, 0, 0, 0]

    # Analog denominator: double poles at omega[0] and omega[3], single
    # poles at omega[1] and omega[2].
    low_pair = polymul([1, omega[0]], [1, omega[0]])
    high_pair = polymul([1, omega[3]], [1, omega[3]])
    mid_pair = polymul([1, omega[1]], [1, omega[2]])
    denominator = polymul(polymul(low_pair, high_pair), mid_pair)

    return bilinear(numerator, denominator, fs)

def plot_signal(signal, fs):
    """Plot ``signal`` on the current axes against a time axis.

    The time axis runs from 0 to ``len(signal)/fs`` with one point per sample.
    """
    sample_count = len(signal)
    time_axis = np.linspace(0, sample_count/fs, sample_count)
    plt.plot(time_axis, signal)

class Audio:
    """Audio clip loaded through pydub, with silence removal and WAV export.

    Attributes:
        audio: The pydub ``AudioSegment`` loaded from ``filename``.
        fs: Frame rate reported by ``audio.frame_rate``.
        types: NumPy dtypes indexed by ``sample_width - 1``.
        data: 1-D array of the raw samples (channels stay interleaved).
        window_length: Number of samples per analysis window.
        threshold: Envelope level a window's mean must exceed to be kept;
            read from ``sys.argv[2]`` when a third argument is present,
            otherwise 10000000.
    """

    def __init__(self, filename):
        """Load ``filename`` and expose its samples as a NumPy array."""
        # Load the audio file
        self.audio = AudioSegment.from_file(filename)

        # Get the sample rate and numpy array of the sound data
        self.fs = self.audio.frame_rate
        # NOTE(review): sample widths 3 and 4 both map to int32 -- confirm
        # that 3-byte samples never reach this point.
        self.types = [np.uint8, np.int16, np.int32, np.int32]
        # np.frombuffer replaces the deprecated binary mode of np.fromstring.
        samples = np.frombuffer(self.audio._data,
                                dtype=self.types[self.audio.sample_width - 1])
        channels = self.audio.channels
        per_channel = [samples[ch::channels] for ch in range(channels)]
        # Flattening the transposed per-channel stack yields the samples in
        # their interleaved order again, as a fresh (writable) array.
        self.data = np.array(per_channel).T.flatten()

        # Parameters
        self.window_length = 100
        self.threshold = int(sys.argv[2]) if len(sys.argv) == 3 else 10000000

    def _loud_window_starts(self, envelope, pbar):
        """Return the start index of each window whose mean envelope exceeds the threshold."""
        starts = []
        previous_kept = False
        total = len(envelope)
        for i in range(0, total, self.window_length):
            # The slice reaches one sample into the following window.
            window = envelope[i:i+self.window_length+1]
            # A window right after a kept window has its mean boosted by
            # 1.5x before the comparison. Starts are appended in increasing
            # order, so remembering the previous decision replaces a list
            # membership scan per window.
            boost = 1.5 if previous_kept else 1.0
            previous_kept = bool(window.mean()*boost > self.threshold)
            if previous_kept:
                starts.append(i)
            pbar.update(min(self.window_length, total - i))
        return starts

    def _shade_regions(self, ax, segments):
        """Shade on ``ax`` the time spans covered by runs of kept windows.

        ``segments`` must be non-empty. A region is closed when the next
        kept window starts more than 0.13 s later, and a new one is opened
        when the next kept window starts at most 0.1 s later.
        """
        start_seg = segments[0]
        in_region = True
        last = len(segments) - 1

        for i, seg in enumerate(segments):
            if i == last:
                # The final kept window closes whatever region is still open.
                if in_region:
                    ax.axvspan(start_seg/self.fs, seg/self.fs, facecolor='g', alpha=0.5)
                break

            gap = segments[i+1]/self.fs - seg/self.fs
            if in_region and gap > 0.13:
                # marks the end of a segment
                ax.axvspan(start_seg/self.fs, seg/self.fs, facecolor='g', alpha=0.5)
                in_region = False
            elif not in_region and gap <= 0.1:
                # marks the start of a segment
                start_seg = seg
                in_region = True

    def remove_silence(self, plot=False):
        """Return the samples of every window judged to be non-silent.

        The samples are A-weighted, an envelope is taken as the magnitude of
        the Hilbert transform, and each window of ``window_length`` samples
        is kept when its mean envelope exceeds ``threshold``.

        Three figures are written next to the input, named after
        ``sys.argv[1]`` with its last four characters removed:
        ``*_processed.png``, ``*_segments.png`` and ``*_trunc.png``.

        Args:
            plot: When true, the first two figures are also shown on screen
                after they have been saved.

        Returns:
            A 1-D NumPy array of the kept samples; it is empty when no
            window exceeds the threshold (a warning is issued in that case).
        """
        import warnings

        stem = sys.argv[1][0:-4]
        sample_dtype = self.types[self.audio.sample_width - 1]

        # Progress bar: five phases of len(data) steps each. The context
        # manager closes the bar even if a processing step raises.
        pbar_step = len(self.data)
        pbar_total = 5*pbar_step

        with tqdm(total=pbar_total) as pbar:
            # Plot 1 - unmodified original audio
            plt.subplot(3, 1, 1)
            plot_signal(self.data, self.fs)
            plt.title("Original audio")

            # Apply A-weighting first
            b, a = A_weight(self.fs)
            y = filtfilt(b, a, self.data)
            pbar.update(pbar_step)

            # Plot 2 - A-weighting applied to samples
            plt.subplot(3, 1, 2)
            plot_signal(y, self.fs)
            plt.title("A-weighted")

            # Get an envelope
            y_env = np.abs(hilbert(y))
            pbar.update(pbar_step)

            # Plot 3 - envelope
            plt.subplot(3, 1, 3)
            plot_signal(y_env, self.fs)
            plt.title("Envelope")

            # Save before showing: once an interactive window has been
            # closed, savefig would write a fresh, empty figure.
            plt.savefig("{0:s}_processed.png".format(stem))
            if plot:
                plt.show()
            plt.close()

            # Get non-silent segments
            segments = self._loud_window_starts(y_env, pbar)

            # Plot for showing regions detected that have audio above threshold value
            fig, ax = plt.subplots()
            fig.set_size_inches(15, 6, forward=True)
            ax.plot(np.linspace(0, len(self.data)/self.fs, len(self.data)), self.data)

            if segments:
                self._shade_regions(ax, segments)
            else:
                warnings.warn("No audio above the threshold was detected; "
                              "the output will be empty.")

            pbar.update(pbar_step)

            plt.title("Detected silence")
            plt.savefig("{0:s}_segments.png".format(stem))
            if plot:
                plt.show()
            plt.close()

            # Splice data segments with a single concatenation instead of
            # growing the array one window at a time.
            if segments:
                out = np.concatenate(
                    [self.data[i:i+self.window_length] for i in segments])
            else:
                out = np.array([], dtype=sample_dtype)

            # Final plot showing truncated output
            plot_signal(out, self.fs)
            plt.title("Truncated audio")
            plt.savefig("{0:s}_trunc.png".format(stem))
            plt.close()

            # Top the bar up to its total.
            pbar.update(pbar_total - pbar.n)

        return out

    def export(self, filename, data):
        """Replace the clip's samples with ``data`` and write it as a WAV file.

        Args:
            filename: Destination passed to the pydub ``export`` call.
            data: The new samples. A NumPy array is converted to raw bytes
                in the clip's sample dtype, the same byte-buffer form that
                ``__init__`` reads ``_data`` in; any other value is stored
                unchanged.
        """
        if isinstance(data, np.ndarray):
            sample_dtype = self.types[self.audio.sample_width - 1]
            data = data.astype(sample_dtype, copy=False).tobytes()
        self.audio._data = data
        self.audio.export(filename, format='wav')

# Output name: the input path minus its last four characters, plus "_cut.wav".
strOut = "{0:s}_cut.wav".format(sys.argv[1][0:-4])

# Load the input, strip the silent windows (showing the diagnostic plots),
# and write the spliced samples out as a WAV file.
audio = Audio(sys.argv[1])
data = audio.remove_silence(plot = True)
audio.export(strOut, data)