Overlay Elements 🔗

The code for this transform is based on https://github.com/danaaubakirova/doc-augmentation by Dana Aubakirova.

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload

!pip install -U pillow

Requirement already satisfied: pillow in /opt/homebrew/Caskroom/miniconda/base/envs/albumentations_examples/lib/python3.9/site-packages (11.1.0)

%matplotlib inline
import cv2
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFont
from pylab import *
import albumentations as A
import json
def visualize(image):
    """Show ``image`` in a new 20x10-inch matplotlib figure with the axes hidden."""
    figure = plt.figure(figsize=(20, 10))
    # The new figure is now the current one, so plt.imshow below draws on
    # the same axes whose decorations are switched off here.
    figure.gca().axis("off")
    plt.imshow(image)
# Input locations are relative paths, resolved against the current working directory.
font_path = "../data/documents/LiberationSerif-Regular.ttf"
image_path = "../data/documents/docs.png"

# IMREAD_COLOR_RGB asks OpenCV for RGB channel order, so the array can be
# handed to matplotlib without a BGR -> RGB conversion.
image = cv2.imread(image_path, cv2.IMREAD_COLOR_RGB)
if image is None:
    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising, so fail here with a clear message.
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Read the label file as UTF-8 explicitly instead of relying on the
# platform's default text encoding.
with open("../data/documents/text.json", encoding="utf-8") as f:
    labels = json.load(f)

visualize(image)

png

# Pipeline with a single step: OverlayElements, applied on every call (p=1).
transform = A.Compose(
    [A.OverlayElements(p=1)],
    seed=137,
    strict=True,
)

Render images to paste 🔗

def render_text(bbox_shape, text, font):
    """Render ``text`` in red on a white canvas the size of a bounding box.

    Args:
        bbox_shape: ``(height, width)`` of the canvas, in pixels.
        text: String to draw, starting at the top-left corner ``(0, 0)``.
        font: Font object forwarded to ``ImageDraw.text``.

    Returns:
        The rendered RGB canvas converted to a NumPy array.
    """
    height, width = bbox_shape

    # PIL takes sizes as (width, height), the reverse of the incoming shape.
    canvas = Image.new("RGB", (width, height), color="white")
    ImageDraw.Draw(canvas).text((0, 0), text, fill="red", font=font)

    return np.array(canvas)
# Pick 10 label indices at random. np.random.choice samples with replacement
# by default, so the same index may be drawn more than once.
bbox_indices_to_update = np.random.choice(len(labels["text"]), size=10)
labels.keys()

dict_keys(['text', 'bbox', 'poly', 'score'])

image_height, image_width = image.shape[:2]
num_channels = image.shape[2] if len(image.shape) == 3 else 1

# Build one overlay entry per selected label: a rendered text patch plus the
# box it should be pasted into.
metadata = []
for index in bbox_indices_to_update:
    selected_bbox = labels["bbox"][index]

    # You may apply any transforms you want to text like random deletion, swapping words, applying synonyms, etc
    text = labels["text"][index]

    # Boxes are unpacked as (left, top, width, height); width and height are
    # treated as fractions of the image size when scaled to pixels below.
    left, top, width_norm, height_norm = selected_bbox

    bbox_height = int(image_height * height_norm)
    bbox_width = int(image_width * width_norm)

    # A box that rounds down to zero pixels on either side cannot hold a
    # rendered patch, so skip it instead of producing an empty overlay.
    if bbox_height < 1 or bbox_width < 1:
        continue

    # Scale the font to 90% of the box height, but never below 1: very small
    # boxes would otherwise request a font size of 0, which Pillow rejects.
    font_size = max(1, int(0.90 * bbox_height))
    font = ImageFont.truetype(font_path, font_size)

    overlay_image = render_text((bbox_height, bbox_width), text, font)

    # The overlay box is passed as (left, top, right, bottom), still in the
    # same relative units as the label file.
    metadata.append(
        {
            "image": overlay_image,
            "bbox": (left, top, left + width_norm, top + height_norm),
        },
    )

Paste new text onto the image 🔗

# Run the overlay pipeline, handing over the rendered patches through the
# overlay_metadata argument, then display the result.
transformed = transform(overlay_metadata=metadata, image=image)
visualize(transformed["image"])

png

As a part of the augmentation pipeline 🔗

# Overlay the rendered text first, then apply further augmentations to the
# composited image. Every step uses p=1, so each one runs on every call.
transform_complex = A.Compose(
    [
        A.OverlayElements(p=1),
        A.RandomCrop(p=1, height=1024, width=1024),
        A.PlanckianJitter(p=1),
        A.Affine(p=1, scale=0.9, shear=10, translate_percent=0.05, rotate=10),
    ],
    # Same settings as the single-step pipeline earlier in this notebook:
    # strict argument checking and a fixed seed, so the random crop, jitter
    # and affine parameters are reproducible between runs.
    strict=True,
    seed=137,
)
transformed = transform_complex(image=image, overlay_metadata=metadata)
visualize(transformed["image"])

png