# Semantic segmentation on the Pascal VOC dataset

The following files are also available on GitHub - https://github.com/albumentations-team/autoalbument/tree/master/examples/pascal_voc

## dataset.py

import cv2
import numpy as np
from torchvision.datasets import VOCSegmentation

# Names of the 21 Pascal VOC segmentation classes; index 0 is the background class.
VOC_CLASSES = [
"background",
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"potted plant",
"sheep",
"sofa",
"train",
"tv/monitor",
]

# RGB color assigned to each class in the Pascal VOC mask images.
# VOC_COLORMAP[i] is the color that encodes VOC_CLASSES[i]; the list lengths match (21 entries).
VOC_COLORMAP = [
[0, 0, 0],
[128, 0, 0],
[0, 128, 0],
[128, 128, 0],
[0, 0, 128],
[128, 0, 128],
[0, 128, 128],
[128, 128, 128],
[64, 0, 0],
[192, 0, 0],
[64, 128, 0],
[192, 128, 0],
[64, 0, 128],
[192, 0, 128],
[64, 128, 128],
[192, 128, 128],
[0, 64, 0],
[128, 64, 0],
[0, 192, 0],
[128, 192, 0],
[0, 64, 128],
]

class SearchDataset(VOCSegmentation):
    """Pascal VOC segmentation dataset adapted for AutoAlbument augmentation search.

    Wraps torchvision's VOCSegmentation and returns (image, mask) pairs where the
    mask is a float32 array of shape [height, width, num_classes] as required by
    AutoAlbument.
    """

    def __init__(self, image_set="train", transform=None, root="~/data/pascal_voc", download=True):
        # `root` and `download` are exposed (with the example's defaults) so the dataset
        # location can be customized without editing the class.
        super().__init__(root=root, image_set=image_set, download=download, transform=transform)

    @staticmethod
    def _convert_to_segmentation_mask(mask):
        # This function converts a mask from the Pascal VOC format to the format required by AutoAlbument.
        #
        # Pascal VOC uses an RGB image to encode the segmentation mask for that image. RGB values of a pixel
        # encode the pixel's class.
        #
        # AutoAlbument requires a segmentation mask to be a NumPy array with the shape [height, width, num_classes].
        # Each channel in this mask should encode values for a single class. Pixel in a mask channel should have
        # a value of 1.0 if the pixel of the image belongs to this class and 0.0 otherwise.
        height, width = mask.shape[:2]
        segmentation_mask = np.zeros((height, width, len(VOC_COLORMAP)), dtype=np.float32)
        for label_index, label in enumerate(VOC_COLORMAP):
            # A pixel belongs to class `label_index` iff all three of its RGB values match the colormap entry.
            segmentation_mask[:, :, label_index] = np.all(mask == label, axis=-1).astype(float)
        return segmentation_mask

    def __getitem__(self, index):
        # OpenCV reads images in BGR order; convert both image and mask to RGB so the
        # mask colors line up with VOC_COLORMAP.
        image = cv2.imread(self.images[index])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mask = cv2.imread(self.masks[index])
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)
        mask = self._convert_to_segmentation_mask(mask)
        if self.transform is not None:
            transformed = self.transform(image=image, mask=mask)
            image = transformed["image"]
            mask = transformed["mask"]
        return image, mask


## search.yaml

# @package _global_

# Settings for Policy Model that searches augmentation policies.
policy_model:

# Multiplier for segmentation loss of a model. Faster AutoAugment uses segmentation loss to prevent augmentations
# from transforming images of a particular class to another class.
task_factor: 0.1
# Multiplier for the gradient penalty for WGAN-GP training. 10 is the default value that was proposed in
# Improved Training of Wasserstein GANs.
gp_factor: 10

# Temperature for Relaxed Bernoulli distribution. The probability of applying a certain augmentation is sampled from
# Relaxed Bernoulli distribution (because Bernoulli distribution is not differentiable). With lower values of
# temperature Relaxed Bernoulli distribution behaves like Bernoulli distribution. In the paper, the authors
# of Faster AutoAugment used 0.05 as a default value for temperature.
temperature: 0.05

# Number of augmentation sub-policies. When an image passes through an augmentation pipeline, Faster AutoAugment
# randomly chooses one sub-policy and uses augmentations from that sub-policy to transform an input image. A larger
# number of sub-policies leads to a more diverse set of augmentations and better performance of a model trained on
# augmented images. However, an increase in the number of sub-policies leads to the exponential growth of a search
# space of augmentations, so you need more training data for Policy Model to find good augmentation policies.
num_sub_policies: 40

# Number of chunks in a batch. Faster AutoAugment splits each batch of images into num_chunks chunks. Then it
# applies the same sub-policy with the same parameters to each image in a chunk. This parameter controls the tradeoff
# between the speed of augmentation search and diversity of augmentations. Larger num_chunks values will lead to
# faster searching but less diverse set of augmentations. Note that this parameter is used only in the searching
# phase. When you train a model with found sub-policies, Albumentations will apply a distinct set of transformations
# to each image separately.
num_chunks: 4

# Number of consecutive augmentations in each sub-policy. Faster AutoAugment will sequentially apply operation_count
# augmentations from a sub-policy to an image. Larger values of operation_count lead to better performance of
# a model trained on augmented images. Simultaneously, larger values of operation_count affect the speed of search
# and increase the searching time.
operation_count: 4

# Settings for Semantic Segmentation Model that is used for two purposes:
# 1. As a model that performs semantic segmentation of input images.
# 2. As a Discriminator for Policy Model.
semantic_segmentation_model:

# The number of classes in the dataset. The dataset implementation should return a mask as a NumPy array with
# the shape [height, width, num_classes]. In a case of binary segmentation you can set num_classes to 1.
num_classes: 21

# The architecture of Semantic Segmentation Model. AutoAlbument uses models from
# https://github.com/qubvel/segmentation_models.pytorch. Please refer to its documentation to get a list of available
# models - https://github.com/qubvel/segmentation_models.pytorch#models-.
architecture: Unet

# The architecture of encoder in Semantic Segmentation Model. Please refer to Segmentation Models' documentation to
# get a list of available encoders - https://github.com/qubvel/segmentation_models.pytorch#encoders-
encoder_architecture: se_resnext50_32x4d

# Either a boolean flag or a string that indicates whether the selected encoder architecture should load pretrained
# weights or use randomly initialized weights.
# - In the case of boolean flag true means using pretrained weights from ImageNet and false means using randomly
#   initialized weights.
# - In the case of string the value should specify the name of the weights. For the list of available weights please
#   refer to https://github.com/qubvel/segmentation_models.pytorch#encoders-
pretrained: True

data:
# Class for the PyTorch Dataset and arguments to it. AutoAlbument will create an object of this class using
# the instantiate method from Hydra - https://hydra.cc/docs/next/patterns/instantiate_objects/overview/.
#
# Note that the target class value in the _target_ argument should be located inside PYTHONPATH so Hydra could
# find it. The directory with the config file is automatically added to PYTHONPATH, so the default value
# dataset.SearchDataset points to the class SearchDataset from the dataset.py file. This dataset.py file is
# located along with the search.yaml file in the same directory provided by --config-dir.
#
# As an alternative, you could provide a path to a Python file with the dataset using the dataset_file parameter
# instead of the dataset parameter. The Python file should contain the implementation of a PyTorch dataset for
# augmentation search. The dataset class should be named SearchDataset. The value in dataset_file could either
# be a relative or an absolute path; in the case of a relative path, the path should be relative to this config
# file's location.
#
# - Example of a relative path:
# dataset_file: dataset.py
#
# - Example of an absolute path:
# dataset_file: /projects/pytorch/dataset.py
#
dataset:
_target_: dataset.SearchDataset

# The data type of input images. Two values are supported:
# - uint8. In that case, all input images should be NumPy arrays with the np.uint8 data type and values in the range
#   [0, 255].
# - float32. In that case, all input images should be NumPy arrays with the np.float32 data type and values in the
#   range [0.0, 1.0].
input_dtype: uint8

# A list of preprocessing augmentations that will be applied to each image before applying augmentations from
# a policy. A preprocessing augmentation should be defined as key: value, where key is the name of augmentation
# from Albumentations, and value is a dictionary with augmentation parameters. The found policy will also apply
# those preprocessing augmentations before applying the main augmentations.
#
# Here is an example of an augmentation pipeline that first pads an image to the size 512x512 pixels, then resizes
# the resulting image to the size 256x256 pixels and finally crops a random patch with the size 224x224 pixels.
#
#  preprocessing:
#    - PadIfNeeded:
#        min_height: 512
#        min_width: 512
#    - Resize:
#        height: 256
#        width: 256
#    - RandomCrop:
#        height: 224
#        width: 224
#
preprocessing:
- Resize:
height: 128
width: 128

# Normalization values for images. For each image, the search pipeline will subtract mean and divide by std.
# Normalization is applied after transforms defined in preprocessing. Note that regardless of input_dtype,
# the normalization function will always receive a float32 input with values in the range [0.0, 1.0], so you should
# define mean and std values accordingly. ImageNet normalization is used by default.
normalization:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]

# Parameters for the PyTorch DataLoader. Please refer to the PyTorch documentation for the description of parameters -
# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader.
dataloader:
batch_size: 64
shuffle: True
num_workers: 4
pin_memory: True
drop_last: True

optim:
# Number of epochs to search parameters of augmentations.
epochs: 60

# Optimizer configuration for Semantic Segmentation Model
main:
lr: 1e-3
betas: [0, 0.999]

# Optimizer configuration for Policy Model
policy:
lr: 1e-3
betas: [0, 0.999]

# Device that will keep PyTorch Tensors and which will be used for training. Please refer to the PyTorch documentation
device: cuda

# Value for torch.backends.cudnn.benchmark
# https://pytorch.org/docs/stable/notes/randomness.html#cuda-convolution-benchmarking
cudnn_benchmark: True

# If set to True AutoAlbument will save a checkpoint that contains states of models and optimizers at the end of each
# epoch. Checkpoints will be saved to the directory <working directory>/checkpoints.
save_checkpoints: False

# Path to a PyTorch checkpoint that contains saved states of models and optimizers. The value should be an absolute path
# to a file. If set, AutoAlbument will resume the searching process with data from the checkpoint.
checkpoint_path: null

# Path to a directory in which AutoAlbument will save TensorBoard logs. Set the value to null if you want to disable
# this feature.
tensorboard_logs_dir: null

hydra:
run:
# Path to the directory that will contain all outputs produced by the search algorithm. ${config_dir:} contains
# path to the directory with the search.yaml config file. Please refer to the Hydra documentation for more
# information - https://hydra.cc/docs/configure_hydra/workdir.
dir: ${config_dir:}/outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}