Skip to content

Semantic segmentation on the Pascal VOC dataset

The following files are also available on GitHub - https://github.com/albumentations-team/autoalbument/tree/master/examples/pascal_voc

dataset.py

import cv2
import numpy as np
from torchvision.datasets import VOCSegmentation


# Human-readable names of the 21 Pascal VOC segmentation classes (20 object categories plus "background").
# NOTE(review): the order presumably matches VOC_COLORMAP entry-for-entry (both lists have 21 items), but the
# code shown here never reads VOC_CLASSES, so confirm before relying on that index mapping.
VOC_CLASSES = [
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "potted plant",
    "sheep",
    "sofa",
    "train",
    "tv/monitor",
]


# RGB colors that encode classes in Pascal VOC segmentation masks. The position of a color in this list is the
# index of the mask channel that `SearchDataset._convert_to_segmentation_mask` fills for pixels of that color.
# The per-row class names below assume this list is index-aligned with VOC_CLASSES — TODO confirm.
VOC_COLORMAP = [
    [0, 0, 0],  # background
    [128, 0, 0],  # aeroplane
    [0, 128, 0],  # bicycle
    [128, 128, 0],  # bird
    [0, 0, 128],  # boat
    [128, 0, 128],  # bottle
    [0, 128, 128],  # bus
    [128, 128, 128],  # car
    [64, 0, 0],  # cat
    [192, 0, 0],  # chair
    [64, 128, 0],  # cow
    [192, 128, 0],  # diningtable
    [64, 0, 128],  # dog
    [192, 0, 128],  # horse
    [64, 128, 128],  # motorbike
    [192, 128, 128],  # person
    [0, 64, 0],  # potted plant
    [128, 64, 0],  # sheep
    [0, 192, 0],  # sofa
    [128, 192, 0],  # train
    [0, 64, 128],  # tv/monitor
]


class SearchDataset(VOCSegmentation):
    """Pascal VOC segmentation dataset that yields `(image, mask)` pairs for AutoAlbument's policy search.

    Images and masks are read with OpenCV. Each mask is converted from the Pascal VOC color encoding to a
    float32 NumPy array with the shape [height, width, len(VOC_COLORMAP)] before the optional transform runs.
    """

    def __init__(self, image_set="train", transform=None):
        """Create the dataset for the given `image_set`.

        Args:
            image_set: Name of the Pascal VOC split, forwarded to `VOCSegmentation`.
            transform: Optional callable invoked as `transform(image=image, mask=mask)` that returns
                a mapping with the keys "image" and "mask".
        """
        # The dataset root is fixed, and `download=True` asks torchvision to fetch the data on construction.
        super().__init__(root="~/data/pascal_voc", image_set=image_set, download=True, transform=transform)

    @staticmethod
    def _read_rgb_image(path):
        """Read the image file at `path` with OpenCV and return it in RGB channel order.

        Raises:
            OSError: If OpenCV cannot read or decode the file.
        """
        image = cv2.imread(path)
        # `cv2.imread` signals failure by returning None rather than raising; without this check the failure
        # would only surface later as an OpenCV assertion error that does not mention the offending file.
        if image is None:
            raise OSError(f"Failed to read image file: {path}")
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    @staticmethod
    def _convert_to_segmentation_mask(mask):
        """Convert a mask from the Pascal VOC format to the format required by AutoAlbument.

        Pascal VOC uses an RGB image to encode the segmentation mask for an image. RGB values of a pixel
        encode the pixel's class.

        AutoAlbument requires a segmentation mask to be a NumPy array with the shape
        [height, width, num_classes]. Each channel in this mask encodes values for a single class. A pixel in
        a mask channel has a value of 1.0 if the pixel of the image belongs to this class and 0.0 otherwise.
        Pixels whose color is not listed in `VOC_COLORMAP` are 0.0 in every channel.

        Args:
            mask: Array with the shape [height, width, 3] that holds RGB colors.

        Returns:
            A float32 array with the shape [height, width, len(VOC_COLORMAP)].
        """
        height, width = mask.shape[:2]
        segmentation_mask = np.zeros((height, width, len(VOC_COLORMAP)), dtype=np.float32)
        for channel, color in enumerate(VOC_COLORMAP):
            # A pixel belongs to the class only if all three color components match. The boolean result is
            # cast to float32 on assignment, so no intermediate float array is needed.
            segmentation_mask[:, :, channel] = np.all(mask == color, axis=-1)
        return segmentation_mask

    def __getitem__(self, index):
        """Return the `(image, mask)` pair at `index`, passed through `self.transform` when it is set.

        Raises:
            OSError: If the image or the mask file cannot be read.
        """
        image = self._read_rgb_image(self.images[index])
        mask = self._convert_to_segmentation_mask(self._read_rgb_image(self.masks[index]))
        if self.transform is not None:
            transformed = self.transform(image=image, mask=mask)
            image, mask = transformed["image"], transformed["mask"]
        return image, mask

search.yaml

# @package _global_

task: semantic_segmentation

# Settings for Policy Model that searches augmentation policies.
policy_model:

  # Multiplier for segmentation loss of a model. Faster AutoAugment uses segmentation loss to prevent augmentations
  # from transforming images of a particular class to another class.
  task_factor: 0.1

  # Multiplier for the gradient penalty for WGAN-GP training. 10 is the default value that was proposed in
  # `Improved Training of Wasserstein GANs`.
  gp_factor: 10

  # Temperature for Relaxed Bernoulli distribution. The probability of applying a certain augmentation is sampled from
  # Relaxed Bernoulli distribution (because Bernoulli distribution is not differentiable). With lower values of
  # `temperature` Relaxed Bernoulli distribution behaves like Bernoulli distribution. In the paper, the authors
  # of Faster AutoAugment used 0.05 as a default value for `temperature`.
  temperature: 0.05

  # Number of augmentation sub-policies. When an image passes through an augmentation pipeline, Faster AutoAugment
  # randomly chooses one sub-policy and uses augmentations from that sub-policy to transform an input image. A larger
  # number of sub-policies leads to a more diverse set of augmentations and better performance of a model trained on
  # augmented images. However, an increase in the number of sub-policies leads to the exponential growth of a search
  # space of augmentations, so you need more training data for Policy Model to find good augmentation policies.
  num_sub_policies: 40

  # Number of chunks in a batch. Faster AutoAugment splits each batch of images into `num_chunks` chunks. Then it
  # applies the same sub-policy with the same parameters to each image in a chunk. This parameter controls the tradeoff
  # between the speed of augmentation search and diversity of augmentations. Larger `num_chunks` values will lead to
  # faster searching but less diverse set of augmentations. Note that this parameter is used only in the searching
  # phase. When you train a model with found sub-policies, Albumentations will apply a distinct set of transformations
  # to each image separately.
  num_chunks: 4

  # Number of consecutive augmentations in each sub-policy. Faster AutoAugment will sequentially apply `operation_count`
  # augmentations from a sub-policy to an image. Larger values of `operation_count` lead to better performance of
  # a model trained on augmented images. Simultaneously, larger values of `operation_count` affect the speed of search
  # and increase the searching time.
  operation_count: 4


# Settings for Semantic Segmentation Model that is used for two purposes:
# 1. As a model that performs semantic segmentation of input images.
# 2. As a Discriminator for Policy Model.
semantic_segmentation_model:

  # The number of classes in the dataset. The dataset implementation should return a mask as a NumPy array with
  # the shape [height, width, num_classes]. In the case of binary segmentation, you can set `num_classes` to 1.
  num_classes: 21

  # The architecture of Semantic Segmentation Model. AutoAlbument uses models from
  # https://github.com/qubvel/segmentation_models.pytorch. Please refer to its documentation to get a list of available
  # models - https://github.com/qubvel/segmentation_models.pytorch#models-.
  architecture: Unet

  # The architecture of encoder in Semantic Segmentation Model. Please refer to Segmentation Models' documentation to
  # get a list of available encoders - https://github.com/qubvel/segmentation_models.pytorch#encoders-
  encoder_architecture: se_resnext50_32x4d

  # Either a boolean flag or a string that indicates whether the selected encoder architecture should load
  # pretrained weights or use randomly initialized weights.
  # - In the case of a boolean flag, `true` means using pretrained weights from ImageNet and `false` means using
  #   randomly initialized weights.
  # - In the case of a string, the value should specify the name of the weights. For the list of available weights,
  #   please refer to https://github.com/qubvel/segmentation_models.pytorch#encoders-
  pretrained: True


data:
  # Class for the PyTorch Dataset and arguments to it. AutoAlbument will create an object of this class using
  # the `instantiate` method from Hydra - https://hydra.cc/docs/next/patterns/instantiate_objects/overview/.
  #
  # Note that the target class value in the `_target_` argument should be located inside PYTHONPATH so that Hydra
  # can find it. The directory with the config file is automatically added to PYTHONPATH, so the default value
  # `dataset.SearchDataset` points to the class `SearchDataset` from the `dataset.py` file. This `dataset.py` file is
  # located along with the `search.yaml` file in the same directory provided by `--config-dir`.
  #
  # As an alternative, you could provide a path to a Python file with the dataset using the `dataset_file` parameter
  # instead of the `dataset` parameter. The Python file should contain the implementation of a PyTorch dataset for
  # augmentation search. The dataset class should be named `SearchDataset`. The value in `dataset_file` could either
  # be a relative or an absolute path; in the case of a relative path, the path should be relative to this config
  # file's location.
  #
  # - Example of a relative path:
  # dataset_file: dataset.py
  #
  # - Example of an absolute path:
  # dataset_file: /projects/pytorch/dataset.py
  #
  dataset:
    _target_: dataset.SearchDataset

  # The data type of input images. Two values are supported:
  # - uint8. In that case, all input images should be NumPy arrays with the np.uint8 data type and values in the range
  #   [0, 255].
  # - float32. In that case, all input images should be NumPy arrays with the np.float32 data type and values in the
  #   range [0.0, 1.0].
  input_dtype: uint8

  # A list of preprocessing augmentations that will be applied to each image before applying augmentations from
  # a policy. A preprocessing augmentation should be defined as `key`: `value`, where `key` is the name of augmentation
  # from Albumentations, and `value` is a dictionary with augmentation parameters. The found policy will also apply
  # those preprocessing augmentations before applying the main augmentations.
  #
  # Here is an example of an augmentation pipeline that first pads an image to the size 512x512 pixels, then resizes
  # the resulting image to the size 256x256 pixels and finally crops a random patch with the size 224x224 pixels.
  #
  #  preprocessing:
  #    - PadIfNeeded:
  #        min_height: 512
  #        min_width: 512
  #    - Resize:
  #        height: 256
  #        width: 256
  #    - RandomCrop:
  #        height: 224
  #        width: 224
  #
  preprocessing:
    - Resize:
        height: 128
        width: 128

  # Normalization values for images. For each image, the search pipeline will subtract `mean` and divide by `std`.
  # Normalization is applied after transforms defined in `preprocessing`. Note that regardless of `input_dtype`,
  # the normalization function will always receive a `float32` input with values in the range [0.0, 1.0], so you should
  # define `mean` and `std` values accordingly. ImageNet normalization is used by default.
  normalization:
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]

  # Parameters for the PyTorch DataLoader. Please refer to the PyTorch documentation for the description of parameters -
  # https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader.
  dataloader:
    _target_: torch.utils.data.DataLoader
    batch_size: 64
    shuffle: True
    num_workers: 4
    pin_memory: True
    drop_last: True

optim:
  # Number of epochs to search parameters of augmentations.
  epochs: 60

  # Optimizer configuration for Semantic Segmentation Model
  main:
    _target_: torch.optim.Adam
    lr: 1e-3
    betas: [0, 0.999]

  # Optimizer configuration for Policy Model
  policy:
    _target_: torch.optim.Adam
    lr: 1e-3
    betas: [0, 0.999]

# Device that will keep PyTorch Tensors and which will be used for training. Please refer to the PyTorch documentation
# for more information -  https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device.
device: cuda

# Value for torch.backends.cudnn.benchmark
# https://pytorch.org/docs/stable/notes/randomness.html#cuda-convolution-benchmarking
cudnn_benchmark: True

# If set to `True`, AutoAlbument will save a checkpoint that contains states of models and optimizers at the end of
# each epoch. Checkpoints will be saved to the directory `<working directory>/checkpoints`.
save_checkpoints: False

# Path to a PyTorch checkpoint that contains saved states of models and optimizers. The value should be an absolute path
# to a file. If set, AutoAlbument will resume the searching process with data from the checkpoint.
checkpoint_path: null

# Path to a directory in which AutoAlbument will save TensorBoard logs. Set the value to `null` if you want to disable
# this feature.
tensorboard_logs_dir: null

hydra:
  run:
    # Path to the directory that will contain all outputs produced by the search algorithm. `${config_dir:}` contains
    # the path to the directory with the `search.yaml` config file. Please refer to the Hydra documentation for more
    # information - https://hydra.cc/docs/configure_hydra/workdir.
    dir: ${config_dir:}/outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}