Source code for numpy_datasets.timeseries.speech_commands

import os
import urllib.request
import numpy as np
import tarfile
import time
from tqdm import tqdm
from scipy.io.wavfile import read as wav_read
from ..utils import download_dataset

DOC = """speech commands classification

    source: https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html

    The dataset has 65,000 one-second long utterances of 30 short
    words, by thousands of different people, contributed by
    members of the public through the AIY website. It’s released
    under a Creative Commons BY 4.0 license, and will continue to
    grow in future releases as more contributions are received.
    The dataset is designed to let you build basic but useful
    voice interfaces for applications, with common words like
    “Yes”, “No”, digits, and directions included. The
    infrastructure we used to create the data has been open
    sourced too, and we hope to see it used by the wider community
    to create their own versions, especially to cover underserved
    languages and applications.

    """

name2class = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9,
}

_dataset = "speech_commands"
_urls = {
    "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz": "speech_commands_v0.01.tar.gz"
}


[docs]def load(path=None):

    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _dataset, _urls)

    t0 = time.time()

    print("Loading speech command")

    tar = tarfile.open(path + "speech_commands/speech_commands_v0.01.tar.gz", "r:gz")

    # Load train set
    wavs = list()
    labels = list()
    noises = list()
    noise_labels = list()
    names = tar.getmembers()
    for name in tqdm(names, ascii=True):
        if "wav" not in name.name:
            continue
        f = tar.extractfile(name.name)  # .read()
        wav = wav_read(f)[1]
        if "noise" in name.name:
            noises.append(wav)
            noise_labels.append(name.name.split("/")[-1])
        else:
            left = 16000 - len(wav)
            to_pad = left // 2
            wavs.append(np.pad(wav, [[to_pad, left - to_pad]]))
            labels.append(name.name.split("/")[-2])
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    y = np.squeeze(
        np.array([np.nonzero(label == unique_labels)[0] for label in labels]).astype(
            "int32"
        )
    )

    data = {
        "wavs": np.array(wavs).astype("float32"),
        "labels": y,
        "names": labels,
        "noises": noises,
        "noises_labels": noise_labels,
    }

    print("Dataset speech commands loaded in{0:.2f}s.".format(time.time() - t0))

    return data