# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TinyMS data utils package."""
import os
import sys
import gzip
import tarfile
import requests
import wget
from PIL import Image
import numpy as np
from tinyms import Tensor

__all__ = ['download_dataset', 'generate_image_list', 'load_resized_img', 'load_img']

IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.tif', '.tiff']


def is_image(filename):
    """
    Check whether the given filename is an image.

    Args:
        filename (str): Name of the file to check.

    Returns:
        bool, True if the filename has a supported image extension, False otherwise.

    """
    return any(filename.lower().endswith(extension) for extension in IMG_EXTENSIONS)
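
# For instance, is_image('photo.JPG') is True since the lowercased name ends
# with '.jpg', while is_image('notes.txt') is False.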


def generate_image_list(dir_path, max_dataset_size=float("inf")):
    """
    Traverse the directory to generate a list of image paths.

    Args:
        dir_path (str): Image directory.
        max_dataset_size (int): Maximum number of image paths to return.
            Default: float("inf").

    Returns:
        Image path list.
    """
    images = []
    assert os.path.isdir(dir_path), '%s is not a valid directory' % dir_path
    for root, _, fnames in sorted(os.walk(dir_path)):
        for fname in fnames:
            if is_image(fname):
                path = os.path.join(root, fname)
                images.append(path)

    print("len(images):", len(images))
    return images[:min(max_dataset_size, len(images))]
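
# A minimal usage sketch of generate_image_list (the directory path below is
# hypothetical):
#
#     paths = generate_image_list('./datasets/images', max_dataset_size=1000)
#     # walks the tree in sorted order and keeps at most 1000 image paths
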
def _unzip(gzip_path):
    """Unzip the dataset file.

    Args:
        gzip_path: dataset file path
    """
    # extract directly if gzip_path ends with `.tar`
    if gzip_path.endswith('.tar'):
        with tarfile.open(gzip_path) as f:
            f.extractall(gzip_path[:gzip_path.rfind('/')])
    elif gzip_path.endswith('.gz'):
        gzip_file = gzip_path.replace('.gz', '')
        with open(gzip_file, 'wb') as f:
            gz_file = gzip.GzipFile(gzip_path)
            f.write(gz_file.read())
        # extract again if the decompressed file ends with `.tar`
        if gzip_file.endswith('.tar'):
            with tarfile.open(gzip_file) as f:
                f.extractall(gzip_file[:gzip_file.rfind('/')])
    else:
        print("Currently unzipping only supports `*.tar`, `*.gz` and `*.tar.gz` files!")
        sys.exit(0)


def _fetch_and_unzip(url, file_name):
    """Download the dataset from the remote url with requests.

    Args:
        url: str, remote download url
        file_name: str, local path of the downloaded file
    """
    res = requests.get(url, stream=True, verify=False)
    # get the dataset size
    total_size = int(res.headers["Content-Length"])
    temp_size = 0
    with open(file_name, "wb+") as f:
        for chunk in res.iter_content(chunk_size=1024):
            temp_size += len(chunk)
            f.write(chunk)
            f.flush()
            done = int(100 * temp_size / total_size)
            # show download progress
            sys.stdout.write("\r[{}{}] {:.2f}%".format("█" * done, " " * (100 - done),
                                                       100 * temp_size / total_size))
            sys.stdout.flush()
    print("\n============== {} is ready ==============".format(file_name))
    _unzip(file_name)
    os.remove(file_name)


def _fetch_and_unzip_by_wget(url, file_name):
    """Download the dataset from the remote url with the wget tool.

    Args:
        url: str, remote download url
        file_name: str, local path of the downloaded file
    """
    def bar_progress(current, total, width=80):
        """Show download progress."""
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
        # Don't use print() as it would start a new line every time.
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

    # wget downloads faster than the requests-based _fetch_and_unzip
    wget.download(url, out=file_name, bar=bar_progress)
    print("\n============== {} is ready ==============".format(file_name))
    _unzip(file_name)
    os.remove(file_name)


def _download_mnist(local_path):
    """Download the dataset from http://yann.lecun.com/exdb/mnist/."""
    dataset_path = os.path.join(local_path, 'mnist')
    train_path = os.path.join(dataset_path, 'train')
    test_path = os.path.join(dataset_path, 'test')
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    print("************** Downloading the MNIST dataset **************")
    train_url = {"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
                 "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"}
    test_url = {"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
                "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"}
    for url in train_url:
        # split the file name from the url
        file_name = os.path.join(train_path, url.split('/')[-1])
        if not os.path.exists(file_name.replace('.gz', '')):
            _fetch_and_unzip(url, file_name)
    for url in test_url:
        # split the file name from the url
        file_name = os.path.join(test_path, url.split('/')[-1])
        if not os.path.exists(file_name.replace('.gz', '')):
            _fetch_and_unzip(url, file_name)

    return dataset_path


def _download_cifar10(local_path):
    """Download the dataset from http://www.cs.toronto.edu/~kriz/cifar.html."""
    dataset_path = os.path.join(local_path, 'cifar10')
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    print("************** Downloading the Cifar10 dataset **************")
    remote_url = "http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz"
    file_name = os.path.join(dataset_path, remote_url.split('/')[-1])
    if not os.path.exists(file_name.replace('.gz', '')):
        _fetch_and_unzip(remote_url, file_name)

    return os.path.join(dataset_path, 'cifar-10-batches-bin')


def _download_cifar100(local_path):
    """Download the dataset from http://www.cs.toronto.edu/~kriz/cifar.html."""
    dataset_path = os.path.join(local_path, 'cifar100')
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    print("************** Downloading the Cifar100 dataset **************")
    remote_url = "http://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz"
    file_name = os.path.join(dataset_path, remote_url.split('/')[-1])
    if not os.path.exists(file_name.replace('.gz', '')):
        _fetch_and_unzip(remote_url, file_name)

    return os.path.join(dataset_path, 'cifar-100-binary')


def _download_voc(local_path):
    """Download the dataset from http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html."""
    dataset_path = os.path.join(local_path, 'voc')
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    print("************** Downloading the VOC2007 dataset **************")
    remote_url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar"
    file_name = os.path.join(dataset_path, remote_url.split('/')[-1])
    if not os.path.exists(os.path.join(dataset_path, 'VOCdevkit', 'VOC2007')):
        _fetch_and_unzip(remote_url, file_name)

    return os.path.join(dataset_path, 'VOCdevkit', 'VOC2007')


def _check_uncompressed_kaggle_display_advertising_files(dataset_path):
    """Check the uncompressed Kaggle Display Advertising dataset files."""
    file_name_list = ["train.txt", "test.txt", "readme.txt"]
    file_size_list = [11147184845, 1460246311, 1927]
    for file_name, file_size in zip(file_name_list, file_size_list):
        file_path = os.path.join(dataset_path, file_name)
        if not os.path.exists(file_path):
            return False
        if os.path.getsize(file_path) != file_size:
            print("************** {} may be corrupted, need to download it again **************"
                  .format(file_path), flush=True)
            return False
    return True


def _download_kaggle_display_advertising(local_path):
    """Download the dataset from
    http://go.criteo.net/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz.
    """
    dataset_path = os.path.join(local_path, "kaggle_display_advertising")
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)

    print("************** Downloading the Kaggle Display Advertising Challenge dataset **************")
    remote_url = "http://go.criteo.net/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz"
    file_name = os.path.join(dataset_path, remote_url.split('/')[-1])
    # criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz already exists locally
    if os.path.exists(file_name) and os.path.getsize(file_name) == 4576820670:
        print("************** Uncompressing the existing tar archive **************", flush=True)
        _unzip(file_name)
    if not _check_uncompressed_kaggle_display_advertising_files(dataset_path):
        _fetch_and_unzip_by_wget(remote_url, file_name)
    else:
        print("{} already has the uncompressed kaggle display advertising dataset.".format(dataset_path),
              flush=True)

    return os.path.join(dataset_path)


download_checker = {
    'mnist': _download_mnist,
    'cifar10': _download_cifar10,
    'cifar100': _download_cifar100,
    'voc': _download_voc,
    'kaggle_display_advertising': _download_kaggle_display_advertising,
}
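
# download_checker acts as a dispatch table: download_dataset() looks up the
# dataset name and delegates to the matching `_download_*` helper. Support for
# a new dataset could be added by registering another entry, e.g. (a sketch
# with a hypothetical dataset name and url, not part of the package):
#
#     def _download_my_dataset(local_path):
#         dataset_path = os.path.join(local_path, 'my_dataset')
#         if not os.path.exists(dataset_path):
#             os.makedirs(dataset_path)
#         remote_url = "http://example.com/my_dataset.tar.gz"
#         _fetch_and_unzip(remote_url, os.path.join(dataset_path, remote_url.split('/')[-1]))
#         return dataset_path
#
#     download_checker['my_dataset'] = _download_my_dataset
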
def download_dataset(dataset_name, local_path='.'):
    r"""
    Download any supported public dataset without specifying many details.

    Args:
        dataset_name (str): The official name of the dataset, currently supports
            `mnist`, `cifar10`, `cifar100`, `voc` and `kaggle_display_advertising`.
        local_path (str): Specifies the local location to download the dataset to.
            Default: `.`.

    Returns:
        str, the local location of the downloaded dataset.

    Examples:
        >>> from tinyms.data import download_dataset
        >>>
        >>> ds_path = download_dataset('mnist')
    """
    download_func = download_checker.get(dataset_name)
    if download_func is None:
        print("Currently dataset_name only supports {}!".format(list(download_checker.keys())))
        sys.exit(0)

    return download_func(local_path)
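
# The `_download_*` helpers create the target directory on demand, so a custom
# location can be passed directly (the path below is hypothetical):
#
#     ds_path = download_dataset('cifar10', local_path='/tmp/datasets')
#     # ds_path == '/tmp/datasets/cifar10/cifar-10-batches-bin'
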
def load_resized_img(path, width=256, height=256):
    """
    Load an image in RGB mode and resize it to (width, height).

    Args:
        path (str): Image path.
        width (int): Target image width. Default: 256.
        height (int): Target image height. Default: 256.

    Returns:
        PIL image class.
    """
    return Image.open(path).convert('RGB').resize((width, height))
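
# A usage sketch (the file name is hypothetical). Note that PIL's resize()
# stretches to exactly (width, height), so the aspect ratio is not preserved:
#
#     img = load_resized_img('sample.png', width=128, height=128)
#     # img.size == (128, 128)
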
def load_img(path):
    """
    Load an image in RGB mode.

    Args:
        path (str): Image path.

    Returns:
        PIL image class.
    """
    assert path is not None and is_image(path), '%s is None or is not an image' % path
    return Image.open(path).convert('RGB')
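
# A usage sketch (the file name is hypothetical):
#
#     img = load_img('sample.jpg')
#     # img.mode == 'RGB' regardless of the source format
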
def save_image(img, img_path):
    """
    Save a numpy image to disk.

    Args:
        img (Union[numpy.ndarray, Tensor]): Image to save.
        img_path (str): The path of the image.
    """
    if isinstance(img, Tensor):
        # Decode a [1, C, H, W] Tensor to an image numpy array.
        mean = 0.5 * 255
        std = 0.5 * 255
        img = (img.asnumpy()[0] * std + mean).astype(np.uint8).transpose((1, 2, 0))
    elif not isinstance(img, np.ndarray):
        raise ValueError("img should be Tensor or numpy array, but got {}".format(type(img)))

    img_pil = Image.fromarray(img)
    img_pil.save(img_path)
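
# A usage sketch, assuming `fake_img` is a [1, C, H, W] output Tensor
# normalized to [-1, 1] (the variable name and path are hypothetical):
#
#     save_image(fake_img, './outputs/fake_img.png')
#
# The (mean, std) pair of (127.5, 127.5) maps values from [-1, 1] back to
# [0, 255] before the array is converted to a PIL image.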