How to Train RT-DETR on Custom Dataset¶
RT-DETR, short for "Real-Time DEtection TRansformer", is a computer vision model developed by Peking University and Baidu. In their paper, "DETRs Beat YOLOs on Real-time Object Detection", the authors claim that RT-DETR can outperform YOLO models in object detection, in both speed and accuracy. The model has been released under the Apache 2.0 license, making it a great option, especially for enterprise projects.
RT-DETR was recently added to the transformers library, which significantly simplifies its fine-tuning process. In this tutorial, we will show you how to train RT-DETR on a custom dataset.
Setup¶
Configure your API keys¶
To fine-tune RT-DETR, you need to provide your HuggingFace Token and Roboflow API key. Follow these steps:
- Open your HuggingFace Settings page. Click Access Tokens, then New Token, to generate a new token.
- Go to your Roboflow Settings page. Click Copy. This will place your private key in the clipboard.
- In Colab, go to the left pane and click on Secrets (🔑).
  - Store the HuggingFace Access Token under the name HF_TOKEN.
  - Store the Roboflow API Key under the name ROBOFLOW_API_KEY.
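Later in this notebook the Roboflow key is read with google.colab.userdata. If you want to confirm that the secrets are stored correctly before continuing, here is a quick optional check (a minimal sketch; it only verifies the two names used above):

from google.colab import userdata

# Read the secrets stored in the Colab "Secrets" pane.
HF_TOKEN = userdata.get("HF_TOKEN")
ROBOFLOW_API_KEY = userdata.get("ROBOFLOW_API_KEY")
print("HF_TOKEN set:", HF_TOKEN is not None)
print("ROBOFLOW_API_KEY set:", ROBOFLOW_API_KEY is not None)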
Select the runtime¶
Let's make sure that we have access to a GPU. We can use the nvidia-smi command to check. In case of any problems, navigate to Edit -> Notebook settings -> Hardware accelerator, set it to L4 GPU, and then click Save.
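The check itself is a single Colab cell that runs the standard NVIDIA utility:

!nvidia-smi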
Thu Jul 11 09:20:53 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 65C P8 11W / 70W | 0MiB / 15360MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
NOTE: To make it easier for us to manage datasets, images, and models, we create a HOME constant.
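The cell that defines the constant is not shown above; a minimal sketch that matches the printed output:

import os

# Root directory used throughout the notebook for datasets and model artifacts.
HOME = os.getcwd()
print("HOME:", HOME)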
HOME: /content
Install dependencies¶
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/roboflow/supervision.git
!pip install -q accelerate
!pip install -q roboflow
!pip install -q torchmetrics
!pip install -q "albumentations>=1.4.5"
Imports¶
import torch
import requests
import numpy as np
import supervision as sv
import albumentations as A
from PIL import Image
from pprint import pprint
from roboflow import Roboflow
from dataclasses import dataclass, replace
from google.colab import userdata
from torch.utils.data import Dataset
from transformers import (
AutoImageProcessor,
AutoModelForObjectDetection,
TrainingArguments,
Trainer
)
from torchmetrics.detection.mean_ap import MeanAveragePrecision
Inference with pre-trained RT-DETR model¶
# @title Load model
CHECKPOINT = "PekingU/rtdetr_r50vd_coco_o365"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForObjectDetection.from_pretrained(CHECKPOINT).to(DEVICE)
processor = AutoImageProcessor.from_pretrained(CHECKPOINT)
# @title Run inference
URL = "https://media.roboflow.com/notebooks/examples/dog.jpeg"
image = Image.open(requests.get(URL, stream=True).raw)
inputs = processor(image, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    outputs = model(**inputs)

w, h = image.size
results = processor.post_process_object_detection(
    outputs, target_sizes=[(h, w)], threshold=0.3)
# @title Display result without NMS
detections = sv.Detections.from_transformers(results[0])
labels = [
model.config.id2label[class_id]
for class_id
in detections.class_id
]
annotated_image = image.copy()
annotated_image = sv.BoundingBoxAnnotator().annotate(annotated_image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections, labels=labels)
annotated_image.thumbnail((600, 600))
annotated_image
# @title Display result with NMS
detections = sv.Detections.from_transformers(results[0]).with_nms(threshold=0.1)
labels = [
model.config.id2label[class_id]
for class_id
in detections.class_id
]
annotated_image = image.copy()
annotated_image = sv.BoundingBoxAnnotator().annotate(annotated_image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections, labels=labels)
annotated_image.thumbnail((600, 600))
annotated_image
Fine-tune RT-DETR on custom dataset¶
# @title Download dataset from Roboflow Universe
ROBOFLOW_API_KEY = userdata.get('ROBOFLOW_API_KEY')
rf = Roboflow(api_key=ROBOFLOW_API_KEY)
project = rf.workspace("roboflow-jvuqo").project("poker-cards-fmjio")
version = project.version(4)
dataset = version.download("coco")
loading Roboflow workspace...
loading Roboflow project...
Downloading Dataset Version Zip in poker-cards-4 to coco:: 100%|██████████| 39123/39123 [00:01<00:00, 27288.54it/s]
Extracting Dataset Version Zip to poker-cards-4 in coco:: 100%|██████████| 907/907 [00:00<00:00, 2984.59it/s]
ds_train = sv.DetectionDataset.from_coco(
images_directory_path=f"{dataset.location}/train",
annotations_path=f"{dataset.location}/train/_annotations.coco.json",
)
ds_valid = sv.DetectionDataset.from_coco(
images_directory_path=f"{dataset.location}/valid",
annotations_path=f"{dataset.location}/valid/_annotations.coco.json",
)
ds_test = sv.DetectionDataset.from_coco(
images_directory_path=f"{dataset.location}/test",
annotations_path=f"{dataset.location}/test/_annotations.coco.json",
)
print(f"Number of training images: {len(ds_train)}")
print(f"Number of validation images: {len(ds_valid)}")
print(f"Number of test images: {len(ds_test)}")
Number of training images: 811
Number of validation images: 44
Number of test images: 44
# @title Display dataset sample
GRID_SIZE = 5
def annotate(image, annotations, classes):
    labels = [
        classes[class_id]
        for class_id
        in annotations.class_id
    ]

    bounding_box_annotator = sv.BoundingBoxAnnotator()
    label_annotator = sv.LabelAnnotator(text_scale=1, text_thickness=2)

    annotated_image = image.copy()
    annotated_image = bounding_box_annotator.annotate(annotated_image, annotations)
    annotated_image = label_annotator.annotate(annotated_image, annotations, labels=labels)
    return annotated_image

annotated_images = []
for i in range(GRID_SIZE * GRID_SIZE):
    _, image, annotations = ds_train[i]
    annotated_image = annotate(image, annotations, ds_train.classes)
    annotated_images.append(annotated_image)
grid = sv.create_tiles(
annotated_images,
grid_size=(GRID_SIZE, GRID_SIZE),
single_tile_size=(400, 400),
tile_padding_color=sv.Color.WHITE,
tile_margin_color=sv.Color.WHITE
)
sv.plot_image(grid, size=(10, 10))
Preprocess the data¶
To fine-tune a model, you must preprocess the data so that it precisely matches the approach used for the pre-trained model. AutoImageProcessor takes care of processing image data to create pixel_values, pixel_mask, and labels that a DETR-style model can train with. The image processor has some attributes that you won't have to worry about:
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]

These are the mean and standard deviation used to normalize the images during model pre-training. They are crucial to replicate when running inference with, or fine-tuning, a pre-trained image model.
Instantiate the image processor from the same checkpoint as the model you want to finetune.
IMAGE_SIZE = 480
processor = AutoImageProcessor.from_pretrained(
CHECKPOINT,
do_resize=True,
size={"width": IMAGE_SIZE, "height": IMAGE_SIZE},
)
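If you want to double-check how the processor is now configured (resize size, rescaling, normalization statistics), you can simply print it; this is an optional sanity check:

print(processor)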
Before passing the images to the processor, apply two preprocessing transformations to the dataset:
- Augmenting images
- Reformatting annotations to meet RT-DETR expectations
First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. Here we use Albumentations, which applies each transformation to the image and updates the bounding boxes accordingly, so the two stay in sync.
train_augmentation_and_transform = A.Compose(
[
A.Perspective(p=0.1),
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.5),
A.HueSaturationValue(p=0.1),
],
bbox_params=A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True,
min_area=25
),
)
valid_transform = A.Compose(
[A.NoOp()],
bbox_params=A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True,
min_area=1
),
)
# @title Visualize some augmented images
IMAGE_COUNT = 5
for i in range(IMAGE_COUNT):
    _, image, annotations = ds_train[i]
    output = train_augmentation_and_transform(
        image=image,
        bboxes=annotations.xyxy,
        category=annotations.class_id
    )

    augmented_image = output["image"]
    augmented_annotations = replace(
        annotations,
        xyxy=np.array(output["bboxes"]),
        class_id=np.array(output["category"])
    )

    annotated_images = [
        annotate(image, annotations, ds_train.classes),
        annotate(augmented_image, augmented_annotations, ds_train.classes)
    ]
    grid = sv.create_tiles(
        annotated_images,
        titles=['original', 'augmented'],
        titles_scale=0.5,
        single_tile_size=(400, 400),
        tile_padding_color=sv.Color.WHITE,
        tile_margin_color=sv.Color.WHITE
    )
    sv.plot_image(grid, size=(6, 6))
The processor expects the annotations to be in the following format: {'image_id': int, 'annotations': List[Dict]}, where each dictionary is a COCO object annotation. Let's add a function that reformats the annotations for a single example, and wrap everything in a PyTorch Dataset class:
class PyTorchDetectionDataset(Dataset):
    def __init__(self, dataset: sv.DetectionDataset, processor, transform: A.Compose = None):
        self.dataset = dataset
        self.processor = processor
        self.transform = transform

    @staticmethod
    def annotations_as_coco(image_id, categories, boxes):
        annotations = []
        for category, bbox in zip(categories, boxes):
            x1, y1, x2, y2 = bbox
            formatted_annotation = {
                "image_id": image_id,
                "category_id": category,
                "bbox": [x1, y1, x2 - x1, y2 - y1],
                "iscrowd": 0,
                "area": (x2 - x1) * (y2 - y1),
            }
            annotations.append(formatted_annotation)

        return {
            "image_id": image_id,
            "annotations": annotations,
        }

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        _, image, annotations = self.dataset[idx]

        # Convert the BGR image (as loaded by supervision/OpenCV) to an RGB numpy array
        image = image[:, :, ::-1]
        boxes = annotations.xyxy
        categories = annotations.class_id

        if self.transform:
            transformed = self.transform(
                image=image,
                bboxes=boxes,
                category=categories
            )
            image = transformed["image"]
            boxes = transformed["bboxes"]
            categories = transformed["category"]

        formatted_annotations = self.annotations_as_coco(
            image_id=idx, categories=categories, boxes=boxes)
        result = self.processor(
            images=image, annotations=formatted_annotations, return_tensors="pt")

        # The image processor adds a batch dimension; let's squeeze it out
        result = {k: v[0] for k, v in result.items()}

        return result
Now you can combine the image and annotation transformations to use on a batch of examples:
pytorch_dataset_train = PyTorchDetectionDataset(
ds_train, processor, transform=train_augmentation_and_transform)
pytorch_dataset_valid = PyTorchDetectionDataset(
ds_valid, processor, transform=valid_transform)
pytorch_dataset_test = PyTorchDetectionDataset(
ds_test, processor, transform=valid_transform)
pytorch_dataset_train[15]
{'pixel_values': tensor([[[0.0745, 0.0745, 0.0745, ..., 0.2431, 0.2471, 0.2471],
[0.0745, 0.0745, 0.0745, ..., 0.2510, 0.2549, 0.2549],
[0.0667, 0.0706, 0.0706, ..., 0.2588, 0.2588, 0.2588],
...,
[0.0118, 0.0118, 0.0118, ..., 0.0510, 0.0549, 0.0510],
[0.0157, 0.0196, 0.0235, ..., 0.0549, 0.0627, 0.0549],
[0.0235, 0.0275, 0.0314, ..., 0.0549, 0.0627, 0.0549]],
[[0.0549, 0.0549, 0.0549, ..., 0.3137, 0.3176, 0.3176],
[0.0549, 0.0549, 0.0549, ..., 0.3216, 0.3255, 0.3255],
[0.0471, 0.0510, 0.0510, ..., 0.3294, 0.3294, 0.3294],
...,
[0.0000, 0.0000, 0.0000, ..., 0.0353, 0.0392, 0.0353],
[0.0000, 0.0000, 0.0039, ..., 0.0392, 0.0471, 0.0392],
[0.0000, 0.0039, 0.0078, ..., 0.0392, 0.0471, 0.0392]],
[[0.0431, 0.0431, 0.0431, ..., 0.3922, 0.3961, 0.3961],
[0.0431, 0.0431, 0.0431, ..., 0.4000, 0.4039, 0.4039],
[0.0353, 0.0392, 0.0392, ..., 0.4078, 0.4078, 0.4078],
...,
[0.0000, 0.0000, 0.0000, ..., 0.0314, 0.0353, 0.0314],
[0.0000, 0.0000, 0.0039, ..., 0.0353, 0.0431, 0.0353],
[0.0000, 0.0039, 0.0078, ..., 0.0353, 0.0431, 0.0353]]]),
'labels': {'size': tensor([480, 480]), 'image_id': tensor([15]), 'class_labels': tensor([36, 4, 44, 52, 48]), 'boxes': tensor([[0.7891, 0.4437, 0.2094, 0.3562],
[0.3984, 0.6484, 0.3187, 0.3906],
[0.5891, 0.4070, 0.2219, 0.3859],
[0.3484, 0.2812, 0.2625, 0.4094],
[0.1602, 0.5023, 0.2672, 0.4109]]), 'area': tensor([17185.5000, 28687.5000, 19729.1250, 24759.0000, 25297.3125]), 'iscrowd': tensor([0, 0, 0, 0, 0]), 'orig_size': tensor([640, 640])}}
You have successfully augmented the images and prepared their annotations. In the final step, create a custom collate_fn to batch images together.
def collate_fn(batch):
    data = {}
    data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
    data["labels"] = [x["labels"] for x in batch]
    return data
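The Trainer calls collate_fn internally, but if you want to inspect a batch yourself, here is a quick sketch with a plain PyTorch DataLoader (the batch size is arbitrary):

from torch.utils.data import DataLoader

loader = DataLoader(pytorch_dataset_train, batch_size=4, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["pixel_values"].shape)  # e.g. torch.Size([4, 3, 480, 480])
print(len(batch["labels"]))         # one labels dict per image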
Preparing a function to compute mAP¶
id2label = {id: label for id, label in enumerate(ds_train.classes)}
label2id = {label: id for id, label in enumerate(ds_train.classes)}
@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor

class MAPEvaluator:

    def __init__(self, image_processor, threshold=0.00, id2label=None):
        self.image_processor = image_processor
        self.threshold = threshold
        self.id2label = id2label

    def collect_image_sizes(self, targets):
        """Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
        image_sizes = []
        for batch in targets:
            batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
            image_sizes.append(batch_image_sizes)
        return image_sizes

    def collect_targets(self, targets, image_sizes):
        post_processed_targets = []
        for target_batch, image_size_batch in zip(targets, image_sizes):
            for target, (height, width) in zip(target_batch, image_size_batch):
                boxes = target["boxes"]
                boxes = sv.xcycwh_to_xyxy(boxes)
                boxes = boxes * np.array([width, height, width, height])
                boxes = torch.tensor(boxes)
                labels = torch.tensor(target["class_labels"])
                post_processed_targets.append({"boxes": boxes, "labels": labels})
        return post_processed_targets

    def collect_predictions(self, predictions, image_sizes):
        post_processed_predictions = []
        for batch, target_sizes in zip(predictions, image_sizes):
            batch_logits, batch_boxes = batch[1], batch[2]
            output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
            post_processed_output = self.image_processor.post_process_object_detection(
                output, threshold=self.threshold, target_sizes=target_sizes
            )
            post_processed_predictions.extend(post_processed_output)
        return post_processed_predictions

    @torch.no_grad()
    def __call__(self, evaluation_results):
        predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

        image_sizes = self.collect_image_sizes(targets)
        post_processed_targets = self.collect_targets(targets, image_sizes)
        post_processed_predictions = self.collect_predictions(predictions, image_sizes)

        evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
        evaluator.warn_on_many_detections = False
        evaluator.update(post_processed_predictions, post_processed_targets)

        metrics = evaluator.compute()

        # Replace the list of per-class metrics with a separate metric for each class
        classes = metrics.pop("classes")
        map_per_class = metrics.pop("map_per_class")
        mar_100_per_class = metrics.pop("mar_100_per_class")
        for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
            class_name = self.id2label[class_id.item()] if self.id2label is not None else class_id.item()
            metrics[f"map_{class_name}"] = class_map
            metrics[f"mar_100_{class_name}"] = class_mar

        metrics = {k: round(v.item(), 4) for k, v in metrics.items()}

        return metrics
eval_compute_metrics_fn = MAPEvaluator(image_processor=processor, threshold=0.01, id2label=id2label)
Training the detection model¶
You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! The images in this dataset are still quite large, even after resizing. This means that finetuning this model will require at least one GPU.
Training involves the following steps:
- Load the model with AutoModelForObjectDetection using the same checkpoint as in the preprocessing.
- Define your training hyperparameters in TrainingArguments.
- Pass the training arguments to Trainer along with the model, dataset, image processor, and data collator.
- Call train() to finetune your model.
When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the label2id and id2label maps that you created earlier from the dataset's metadata. Additionally, we specify ignore_mismatched_sizes=True to replace the existing classification head with a new one.
model = AutoModelForObjectDetection.from_pretrained(
CHECKPOINT,
id2label=id2label,
label2id=label2id,
anchor_image_size=None,
ignore_mismatched_sizes=True,
)
Some weights of RTDetrForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_r50vd_coco_o365 and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.decoder.class_embed.3.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.3.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.decoder.class_embed.4.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.4.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.decoder.class_embed.5.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.decoder.class_embed.5.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
- model.denoising_class_embed.weight: found shape torch.Size([81, 256]) in the checkpoint and torch.Size([54, 256]) in the model instantiated
- model.enc_score_head.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([53]) in the model instantiated
- model.enc_score_head.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([53, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In the TrainingArguments, use output_dir to specify where to save your model, then configure hyperparameters as you see fit. With num_train_epochs=10, training takes about 15 minutes on a Google Colab T4 GPU; increase the number of epochs to get better results.

Important notes:

- Do not remove unused columns, because this would drop the image column. Without the image column, you can't create pixel_values. For this reason, set remove_unused_columns to False.
- Set eval_do_concat_batches=False to get proper evaluation results. Images have different numbers of target boxes; if batches are concatenated, we will not be able to determine which boxes belong to which image.
training_args = TrainingArguments(
output_dir=f"{dataset.name.replace(' ', '-')}-finetune",
num_train_epochs=20,
max_grad_norm=0.1,
learning_rate=5e-5,
warmup_steps=300,
per_device_train_batch_size=16,
dataloader_num_workers=2,
metric_for_best_model="eval_map",
greater_is_better=True,
load_best_model_at_end=True,
eval_strategy="epoch",
save_strategy="epoch",
save_total_limit=2,
remove_unused_columns=False,
eval_do_concat_batches=False,
)
Finally, bring everything together, and call train():
trainer = Trainer(
model=model,
args=training_args,
train_dataset=pytorch_dataset_train,
eval_dataset=pytorch_dataset_valid,
tokenizer=processor,
data_collator=collate_fn,
compute_metrics=eval_compute_metrics_fn,
)
trainer.train()
Evaluate¶
# @title Collect predictions
targets = []
predictions = []
for i in range(len(ds_test)):
    path, source_image, annotations = ds_test[i]
    image = Image.open(path)

    inputs = processor(image, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)

    w, h = image.size
    results = processor.post_process_object_detection(
        outputs, target_sizes=[(h, w)], threshold=0.3)

    detections = sv.Detections.from_transformers(results[0])

    targets.append(annotations)
    predictions.append(detections)
# @title Calculate mAP
mean_average_precision = sv.MeanAveragePrecision.from_detections(
predictions=predictions,
targets=targets,
)
print(f"map50_95: {mean_average_precision.map50_95:.2f}")
print(f"map50: {mean_average_precision.map50:.2f}")
print(f"map75: {mean_average_precision.map75:.2f}")
map50_95: 0.89
map50: 0.94
map75: 0.94
# @title Calculate Confusion Matrix
confusion_matrix = sv.ConfusionMatrix.from_detections(
predictions=predictions,
targets=targets,
classes=ds_test.classes
)
_ = confusion_matrix.plot()
Save fine-tuned model on hard drive¶
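The saving cell itself is not included above; a minimal sketch that matches the output below (assuming the /content/rt-detr path shown there):

model.save_pretrained("/content/rt-detr")
processor.save_pretrained("/content/rt-detr")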
['/content/rt-detr/preprocessor_config.json']
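If you later want to reload the fine-tuned model from disk (for example, in a fresh runtime), a sketch assuming the same directory:

model = AutoModelForObjectDetection.from_pretrained("/content/rt-detr").to(DEVICE)
processor = AutoImageProcessor.from_pretrained("/content/rt-detr")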
Inference with fine-tuned RT-DETR model¶
IMAGE_COUNT = 5
for i in range(IMAGE_COUNT):
    path, source_image, annotations = ds_test[i]
    image = Image.open(path)

    inputs = processor(image, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)

    w, h = image.size
    results = processor.post_process_object_detection(
        outputs, target_sizes=[(h, w)], threshold=0.3)
    detections = sv.Detections.from_transformers(results[0]).with_nms(threshold=0.1)

    annotated_images = [
        annotate(source_image, annotations, ds_train.classes),
        annotate(source_image, detections, ds_train.classes)
    ]
    grid = sv.create_tiles(
        annotated_images,
        titles=['ground truth', 'prediction'],
        titles_scale=0.5,
        single_tile_size=(400, 400),
        tile_padding_color=sv.Color.WHITE,
        tile_margin_color=sv.Color.WHITE
    )
    sv.plot_image(grid, size=(6, 6))