# Source code for numpy_datasets.images.celeb

from tqdm import tqdm
import matplotlib.image as mpimg
import zipfile
import numpy as np
import os
import time
import io
from typing import Any, Callable, List, Iterable, Optional, TypeVar

# BibTeX entry for the paper "Deep Learning Face Attributes in the Wild"
# (Liu, Luo, Wang and Tang, ICCV 2015).
_CITATION = """\
@inproceedings{conf/iccv/LiuLWT15,
  added-at = {2018-10-09T00:00:00.000+0200},
  author = {Liu, Ziwei and Luo, Ping and Wang, Xiaogang and Tang, Xiaoou},
  biburl = {https://www.bibsonomy.org/bibtex/250e4959be61db325d2f02c1d8cd7bfbb/dblp},
  booktitle = {ICCV},
  crossref = {conf/iccv/2015},
  ee = {http://doi.ieeecomputersociety.org/10.1109/ICCV.2015.425},
  interhash = {3f735aaa11957e73914bbe2ca9d5e702},
  intrahash = {50e4959be61db325d2f02c1d8cd7bfbb},
  isbn = {978-1-4673-8391-2},
  keywords = {dblp},
  pages = {3730-3738},
  publisher = {IEEE Computer Society},
  timestamp = {2018-10-11T11:43:28.000+0200},
  title = {Deep Learning Face Attributes in the Wild.},
  url = {http://dblp.uni-trier.de/db/conf/iccv/iccv2015.html#LiuLWT15},
  year = 2015
}
"""


def download(path: str) -> None:
    """Fetch the CelebA archive into ``path`` unless it is already there.

    The archive is retrieved by running the ``kaggle`` command through the
    shell from inside ``path``. Nothing is executed when
    ``celeba-dataset.zip`` already exists in that directory.

    Parameters
    ----------

    path: str
        directory that holds (or will hold) ``celeba-dataset.zip``; it is
        created, together with any missing parent directories, if needed

    Notes
    -----

    The exit status of the ``kaggle`` command is not inspected, so a failed
    download does not raise here.
    """
    # makedirs also creates missing parents and does not fail if another
    # process created the directory in the meantime.
    os.makedirs(path, exist_ok=True)
    if os.path.exists(os.path.join(path, "celeba-dataset.zip")):
        return

    cwd = os.getcwd()
    os.chdir(path)
    try:
        os.system("kaggle datasets download -d jessicali9530/celeba-dataset")
    finally:
        # Always restore the caller's working directory, even on an
        # interruption (e.g. KeyboardInterrupt) during the download.
        os.chdir(cwd)


def load(path=None):
    """face images with attributes

    CelebFaces Attributes Dataset (CelebA) is a large-scale face attributes
    dataset with more than 200K celebrity images, each with 40 attribute
    annotations. The images in this dataset cover large pose variations and
    background clutter. CelebA has large diversities, large quantities, and
    rich annotations, including

    - 10,177 number of identities,
    - 202,599 number of face images, and
    - 5 landmark locations, 40 binary attributes annotations per image.

    The dataset can be employed as the training and test sets for the
    following computer vision tasks: face attribute recognition, face
    detection, and landmark (or facial part) localization.

    Note: CelebA dataset may contain potential bias. The fairness indicators
    `https://github.com/tensorflow/fairness-indicators/blob/master/fairness_indicators/documentation/examples/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb`
    goes into detail about several considerations to keep in mind while using
    the CelebA dataset.

    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    dataset: dict
        dictionary with the following entries

        images: array
            every ``.jpg`` member of the archive, decoded and stacked in the
            order the members are listed in the archive

        attributes: array
            float32 values of ``list_attr_celeba.csv`` without its header
            row and without its first column

        names: array
            header row of ``list_attr_celeba.csv`` without its first column

    Raises
    ------

    KeyError
        if ``path`` is None and the ``DATASET_PATH`` environment variable
        is not set
    """

    if path is None:
        path = os.environ["DATASET_PATH"]

    root = os.path.join(path, "celebA")
    download(root)

    t0 = time.time()

    # Context managers make sure the archive and each member are closed
    # even if decoding or parsing fails part-way through.
    with zipfile.ZipFile(os.path.join(root, "celeba-dataset.zip"), "r") as archive:
        images = []
        for name in tqdm(archive.namelist()):
            # Match on the extension so that a non-image member that merely
            # contains "jpg" in its name is not handed to the decoder.
            if name.endswith(".jpg"):
                with archive.open(name) as member:
                    images.append(mpimg.imread(member, "jpg"))

        with archive.open("list_attr_celeba.csv") as attr_file:
            atts = np.loadtxt(
                attr_file,
                delimiter=",",
                dtype=str,
            )
    names = atts[0, 1:]

    # Other annotation files of the archive, currently not loaded:
    # list_bbox_celeba.csv
    # list_eval_partition.csv
    # list_landmarks_align_celeba.csv

    print("Dataset celebA loaded in {0:.2f}s.".format(time.time() - t0))
    # NOTE(review): images follow archive order while attributes follow CSV
    # row order; nothing here checks that the two agree -- confirm.
    # NOTE(review): np.array(images) presumably relies on all images having
    # the same shape -- confirm.
    dataset = {
        "images": np.array(images),
        "attributes": atts[1:, 1:].astype("float32"),
        "names": names,
    }
    return dataset