import os
import numpy as np
import tarfile
from tqdm import tqdm
from scipy.io.wavfile import read as wav_read
from ..utils import download_dataset
_urls = {
    "https://zenodo.org/record/3233082/files/audio-dev.tar.gz?download=1": "audio-dev.tar.gz",
    "https://zenodo.org/record/3233082/files/annotations-dev.csv?download=1": "annotations-dev.csv",
}
coarse_labels = [
    "engine",
    "machinery-impact",
    "non-machinery-impact",
    "powered-saw",
    "alert-signal",
    "music",
    "human-voice",
    "dog",
]
fine_labels = [
    "small-sounding-engine",
    "medium-sounding-engine",
    "large-sounding-engine",
    "engine-of-uncertain-size",
    "rock-drill",
    "jackhammer",
    "hoe-ram",
    "pile-driver",
    "other-unknown-impact-machinery",
    "non-machinery-impact",
    "chainsaw",
    "small-medium-rotating-saw",
    "large-rotating-saw",
    "other-unknown-powered-saw",
    "car-horn",
    "car-alarm",
    "siren",
    "reverse-beeper",
    "other-unknown-alert-signal",
    "stationary-music",
    "mobile-music",
    "ice-cream-truck",
    "music-from-uncertain-source",
    "person-or-small-group-talking",
    "person-or-small-group-shouting",
    "large-crowd",
    "amplified-speech",
    "other-unknown-human-voice",
    "dog-barking-whining",
]
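
# Illustrative sketch (not part of the original module): the coarse/fine
# grouping that load() below encodes through its hard-coded `class_limits`
# list, written out as slices over the 29 fine-label columns. The name
# `coarse_to_fine` is a hypothetical helper introduced here for clarity;
# the boundaries mirror class_limits = [0, 4, 9, 10, 14, 19, 23, 28, 29].
coarse_to_fine = {
    "engine": slice(0, 4),
    "machinery-impact": slice(4, 9),
    "non-machinery-impact": slice(9, 10),
    "powered-saw": slice(10, 14),
    "alert-signal": slice(14, 19),
    "music": slice(19, 23),
    "human-voice": slice(23, 28),
    "dog": slice(28, 29),
}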
def load(path=None):
"""multilabel urban sound classification
Reference at https://zenodo.org/record/3233082
Description
SONYC Urban Sound Tagging (SONYC-UST) is a dataset for the development and
evaluation of machine listening systems for realistic urban noise
monitoring. The audio was recorded from the SONYC acoustic sensor network.
Volunteers on the Zooniverse citizen science platform tagged the presence
of 23 classes that were chosen in consultation with the New York City
Department of Environmental Protection. These 23 fine-grained classes can be
grouped into 8 coarse-grained classes. The recordings are split into three
subsets: training, validation, and test. These sets are disjoint with
respect to the sensor from which each recording came. For increased
reliability, three volunteers annotated each recording, and members of the
SONYC team subsequently created a set of ground-truth tags for the
validation set using a two-stage annotation procedure in which two
annotators independently tagged and then collectively resolved any
disagreements. For more details on the motivation and creation of this
dataset see the DCASE 2019 Urban Sound Tagging Task website.
Audio data
The provided audio has been acquired using the SONYC acoustic sensor network
for urban noise pollution monitoring. Over 50 different sensors have been
deployed in New York City, and these sensors have collectively gathered the
equivalent of 37 years of audio data, of which we provide a small subset.
The data was sampled by selecting the nearest neighbors on VGGish features
of recordings known to have classes of interest. All recordings are 10
seconds and were recorded with identical microphones at identical gain
settings. To maintain privacy, the recordings in this release have been
distributed in time and location, and the time and location of the
recordings are not included in the metadata.
Labels
there are fine and coarse labels
engine
1: small-sounding-engine
2: medium-sounding-engine
3: large-sounding-engine
X: engine-of-uncertain-size
machinery-impact
1: rock-drill
2: jackhammer
3: hoe-ram
4: pile-driver
X: other-unknown-impact-machinery
non-machinery-impact
1: non-machinery-impact
powered-saw
1: chainsaw
2: small-medium-rotating-saw
3: large-rotating-saw
X: other-unknown-powered-saw
alert-signal
1: car-horn
2: car-alarm
3: siren
4: reverse-beeper
X: other-unknown-alert-signal
music
1: stationary-music
2: mobile-music
3: ice-cream-truck
X: music-from-uncertain-source
human-voice
1: person-or-small-group-talking
2: person-or-small-group-shouting
3: large-crowd
4: amplified-speech
X: other-unknown-human-voice
dog
1: dog-barking-whining
"""
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, "ust", _urls)

    # Open the audio archive and load the annotation table
    files = tarfile.open(os.path.join(path, "ust", "audio-dev.tar.gz"), "r:gz")
    annotations = np.loadtxt(
        os.path.join(path, "ust", "annotations-dev.csv"),
        delimiter=",",
        skiprows=1,
        dtype="str",
    )
    # Build "split/filename" keys that match the member paths in the archive
    filenames = [
        annotations[i, 0] + "/" + str(annotations[i, 2])
        for i in range(len(annotations))
    ]
    # Fine-grained label matrix and the column limits of each coarse class
    # (renamed from `fine_labels` to avoid shadowing the module-level list)
    fine_annotations = annotations[:, 4:33].astype("float32").astype("int32")
    class_limits = [0, 4, 9, 10, 14, 19, 23, 28, 29]
    n_classes = len(class_limits) - 1
    n_samples = len(annotations)
    # A coarse class is present whenever any of its fine classes is present
    coarse_annotations = np.zeros((n_samples, n_classes), dtype="int")
    for k in range(n_classes):
        block = fine_annotations[:, class_limits[k] : class_limits[k + 1]]
        coarse_annotations[:, k] = block.max(1)
    # 2794 ten-second recordings sampled at 44.1 kHz (441000 samples each)
    wavs = np.zeros((2794, 441000), dtype="float32")
    coarse = np.zeros((2794, 8), dtype="int32")
    fine = np.zeros((2794, 29), dtype="int32")
    cpt = 0
    for name in tqdm(files.getnames(), ascii=True):
        if not name.endswith(".wav"):
            continue
        # Match the archive member to its (first) annotation row; the
        # original overwrote `filenames` with the tar member names here,
        # which broke this lookup.
        row = filenames.index(name)
        wavs[cpt] = wav_read(files.extractfile(name))[1].astype("float32")
        coarse[cpt] = coarse_annotations[row]
        fine[cpt] = fine_annotations[row]
        cpt += 1

    data = {"wavs": wavs, "fine_labels": fine, "coarse_labels": coarse}
    return data
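

# A minimal usage sketch (not part of the original module): assumes the
# DATASET_PATH environment variable points at a writable directory and
# that `download_dataset` fetches the archives on first use. The printed
# shapes follow the arrays allocated in load() above.
if __name__ == "__main__":
    dataset = load()
    print(dataset["wavs"].shape)           # (2794, 441000)
    print(dataset["coarse_labels"].shape)  # (2794, 8)
    print(dataset["fine_labels"].shape)    # (2794, 29)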