Overlay Elements 🔗
The code for this transform is based on https://github.com/danaaubakirova/doc-augmentation by Dana Aubakirova.
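OverlayElements pastes externally rendered images (here, re-rendered text snippets) onto the input image at locations supplied via per-call metadata. The rest of this notebook renders replacement text for a handful of document regions and pastes it back in place.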
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
!pip install -U pillow
Requirement already satisfied: pillow in /opt/homebrew/Caskroom/miniconda/base/envs/albumentations_examples/lib/python3.9/site-packages (11.1.0)
%matplotlib inline
import json

import cv2
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFont

import albumentations as A
def visualize(image):
    plt.figure(figsize=(20, 10))
    plt.axis("off")
    plt.imshow(image)
font_path = "../data/documents/LiberationSerif-Regular.ttf"
image = cv2.imread("../data/documents/docs.png", cv2.IMREAD_COLOR_RGB)
with open("../data/documents/text.json") as f:
    labels = json.load(f)
visualize(image)

transform = A.Compose([A.OverlayElements(p=1)], strict=True, seed=137)
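Here strict=True asks Compose to validate the call signature (unknown arguments raise an error instead of being silently ignored), and seed=137 fixes the pipeline's random state so repeated runs produce the same augmentations; both are optional.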
Render images to paste 🔗
def render_text(bbox_shape, text, font):
    bbox_height, bbox_width = bbox_shape

    # Create an empty RGB image with the size of the bounding box
    bbox_img = Image.new("RGB", (bbox_width, bbox_height), color="white")
    draw = ImageDraw.Draw(bbox_img)

    # Draw the text in red
    draw.text((0, 0), text, fill="red", font=font)

    return np.array(bbox_img)
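To sanity-check the helper, you can render an arbitrary string into a patch; the size, text, and font size below are made up for illustration:

# Hypothetical quick check: render a string into a 40x300-pixel white patch
sample_font = ImageFont.truetype(font_path, 32)
patch = render_text((40, 300), "sample text", sample_font)
print(patch.shape, patch.dtype)  # (40, 300, 3) uint8
visualize(patch)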
# Pick 10 text regions to re-render (np.random.choice samples with replacement by default)
bbox_indices_to_update = np.random.choice(range(len(labels["text"])), 10)
labels.keys()
dict_keys(['text', 'bbox', 'poly', 'score'])
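The JSON holds parallel lists per text region: text (the strings) and bbox (normalized left, top, width, height, as unpacked below) are used here; poly and score are presumably the region polygons and detection confidences, and are not needed for this example.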
image_height, image_width = image.shape[:2]
num_channels = image.shape[2] if len(image.shape) == 3 else 1
metadata = []

for index in bbox_indices_to_update:
    selected_bbox = labels["bbox"][index]

    # You may apply any transform you want to the text, such as random deletion,
    # swapping words, or replacing them with synonyms
    text = labels["text"][index]

    left, top, width_norm, height_norm = selected_bbox

    bbox_height = int(image_height * height_norm)
    bbox_width = int(image_width * width_norm)

    font = ImageFont.truetype(font_path, int(0.90 * bbox_height))

    overlay_image = render_text((bbox_height, bbox_width), text, font)

    metadata += [
        {
            "image": overlay_image,
            "bbox": (left, top, left + width_norm, top + height_norm),
        },
    ]
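Each entry now pairs a rendered patch with where it should land; the bbox is in normalized (x_min, y_min, x_max, y_max) coordinates, matching how it was unpacked above. A quick way to inspect one entry:

# Inspect the first metadata entry (exact values depend on the random indices)
entry = metadata[0]
print(entry["image"].shape)  # pixel-space patch, e.g. (bbox_height, bbox_width, 3)
print(entry["bbox"])         # normalized (x_min, y_min, x_max, y_max)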
Paste the new text onto the image 🔗
transformed = transform(image=image, overlay_metadata=metadata)
visualize(transformed["image"])

As part of an augmentation pipeline 🔗
transform_complex = A.Compose(
    [
        A.OverlayElements(p=1),
        A.RandomCrop(p=1, height=1024, width=1024),
        A.PlanckianJitter(p=1),
        A.Affine(p=1, scale=0.9, shear=10, translate_percent=0.05, rotate=10),
    ],
)
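Note the ordering: OverlayElements comes first, so the pasted text is baked into the image before RandomCrop, PlanckianJitter, and Affine run, and the overlays are cropped, color-shifted, and warped together with the rest of the document.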
transformed = transform_complex(image=image, overlay_metadata=metadata)
visualize(transformed["image"])
