# Source code for numpy_datasets.timeseries.audiomnist

import io
import os
import time
from ..utils import download_dataset
import zipfile

import numpy as np
from scipy.io.wavfile import read as wav_read
from tqdm import tqdm


_dataset = "audiomnist"
_urls = {"https://github.com/soerenab/AudioMNIST/archive/master.zip": "data.zip"}


def load(path=None):
    """
    Load the AudioMNIST spoken-digit recordings as zero-padded arrays.

        https://github.com/soerenab/AudioMNIST

    The archive is obtained through ``download_dataset`` and then read from
    ``<path>/audiomnist/data.zip``. Every archive member whose name ends in
    ``.wav`` is decoded in memory (nothing is extracted to disk), converted
    to float32, and finally copied into one 2-D array in which shorter
    recordings are centred and zero-padded to the length of the longest one.

    Labels are parsed from the file name, which is expected to look like
    ``<digit>_<speaker>_<...>.wav``: the first underscore-separated field is
    the digit and the second is the 1-based speaker number.

    The sampling rate reported by each wav file is discarded.
    NOTE(review): the upstream project describes the recordings as 48 kHz
    mono -- confirm against the repository if the rate matters to you.

    Parameters
    ----------
    path : str, optional
        Root directory holding the datasets. When ``None`` the value of the
        ``DATASET_PATH`` environment variable is used.

    Returns
    -------
    all_wavs : numpy.ndarray
        Array of shape ``(n_recordings, N)`` where ``N`` is the length of the
        longest recording. The dtype is numpy's default float (float64).
    digits : numpy.ndarray
        Digit label of each recording, in the same order as ``all_wavs``.
    speakers : numpy.ndarray
        0-based speaker index of each recording (file-name value minus one).

    Raises
    ------
    KeyError
        If ``path`` is ``None`` and ``DATASET_PATH`` is not set.
    ValueError, IndexError
        If a ``.wav`` member's name does not follow the expected
        ``<digit>_<speaker>_...`` pattern.
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _dataset, _urls)

    t0 = time.time()

    wavs = []
    digits = []
    speakers = []
    longest = 0  # length of the longest recording seen so far

    archive_path = os.path.join(path, _dataset, "data.zip")
    # The context manager guarantees the archive handle is closed even if a
    # member fails to decode.
    with zipfile.ZipFile(archive_path) as archive:
        for filename in tqdm(archive.namelist(), ascii=True):
            # Suffix match: skips directories and non-audio files such as
            # the repository's README or scripts.
            if not filename.endswith(".wav"):
                continue

            # Parse both labels before touching any list so the three lists
            # can never get out of step with each other.
            fields = filename.split("/")[-1].split("_")
            digit = int(fields[0])
            speaker = int(fields[1]) - 1  # file names count speakers from 1

            # wav_read returns (rate, data); only the samples are kept.
            samples = wav_read(io.BytesIO(archive.read(filename)))[1]
            samples = samples.astype("float32")

            digits.append(digit)
            speakers.append(speaker)
            wavs.append(samples)
            longest = max(longest, len(samples))

    digits = np.array(digits)
    speakers = np.array(speakers)

    # np.zeros defaults to float64; kept as-is so the returned dtype does not
    # change for existing callers.
    all_wavs = np.zeros((len(wavs), longest))
    for i, samples in enumerate(wavs):
        # Centre the recording; any odd leftover sample goes to the right.
        left = (longest - len(samples)) // 2
        all_wavs[i, left : left + len(samples)] = samples

    print("Audio-MNIST loaded in {} s.".format(time.time() - t0))
    return all_wavs, digits, speakers