D-FINE

D-FINE 是一个强大的实时目标检测器,将 DETR 中的边界框回归任务重新定义为了细粒度的分布优化(FDR),并引入全局最优的定位自蒸馏(GO-LSD),在不增加额外推理和训练成本的情况下,实现了卓越的性能。

Peterande/D-FINE: D-FINE: Redefine Regression Task of DETRs as Fine-grained Distribution Refinement [ICLR 2025 Spotlight]

数据集-OpenDataLab

D-FINE 论文理解_dfine-CSDN博客

D-FINE:实时目标检测的“速度与激情”,精准又高效! - 知乎

训练

准备数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import json
import random
import xml.etree.ElementTree as ET
from pathlib import Path

# Image suffixes tried, in this order, when locating the picture that
# belongs to an annotation file.
IMG_EXTS = [".jpg", ".jpeg", ".png", ".bmp", ".webp"]

# ==========================
# Hardcoded runtime settings
# ==========================
# All paths are resolved relative to the current working directory.
SETTINGS = {
    # Directory that holds the image files.
    "images_dir": "coco/images",
    # Directory that holds the *.xml annotation files.
    "xml_dir": "coco/Annotations",
    # Root directory for every generated file.
    "output_dir": "coco/converted",
    # Split fractions; they must add up to 1.0.
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    # Seed for the shuffle that precedes the split.
    "seed": 42,
}


def parse_xml(xml_path: Path):
    """Read one XML annotation file and extract its usable boxes.

    Args:
        xml_path: Path of the annotation file.

    Returns:
        ``(filename, width, height, objects)`` where ``objects`` is a list of
        ``(name, xmin, ymin, xmax, ymax)`` tuples, or ``None`` when the file
        cannot be used: malformed XML, missing/invalid ``<size>``, or no
        valid object at all.
    """
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        # A single broken file must not abort the whole conversion; the
        # caller treats None as "skipped".
        return None

    filename = (root.findtext("filename") or "").strip()
    size = root.find("size")
    if size is None:
        return None

    try:
        w = int(float(size.findtext("width", default="0")))
        h = int(float(size.findtext("height", default="0")))
    except (ValueError, OverflowError):
        # Empty or non-numeric text ("", "abc", "inf", "nan").
        return None
    if w <= 0 or h <= 0:
        return None

    objs = []
    for obj in root.findall("object"):
        name = (obj.findtext("name") or "").strip()
        box = obj.find("bndbox")
        if not name or box is None:
            continue

        try:
            xmin, ymin, xmax, ymax = (
                float(box.findtext(tag, default="0"))
                for tag in ("xmin", "ymin", "xmax", "ymax")
            )
        except ValueError:
            # Skip only the object whose coordinates cannot be parsed.
            continue
        # Drop degenerate boxes (zero or negative extent).
        if xmax <= xmin or ymax <= ymin:
            continue
        objs.append((name, xmin, ymin, xmax, ymax))

    if not objs:
        return None
    return filename, w, h, objs


def find_img(images_dir: Path, stem: str):
    """Return the first existing ``images_dir/<stem><ext>`` or ``None``.

    Extensions are tried in the order given by ``IMG_EXTS``.
    """
    candidates = (images_dir / f"{stem}{ext}" for ext in IMG_EXTS)
    return next((path for path in candidates if path.exists()), None)


def to_coco(samples, cls2id):
    """Turn parsed samples into a COCO-style dictionary.

    Args:
        samples: Iterable of dicts with the keys ``file_name``, ``width``,
            ``height`` and ``objects`` (``(name, xmin, ymin, xmax, ymax)``
            tuples).
        cls2id: Mapping from class name to category id.

    Returns:
        A dict with ``images``, ``annotations`` and ``categories`` lists.
        Image ids and annotation ids both start at 1.
    """
    images = []
    annotations = []

    for img_id, sample in enumerate(samples, start=1):
        image_entry = {
            "id": img_id,
            "file_name": sample["file_name"],
            "width": sample["width"],
            "height": sample["height"],
        }
        images.append(image_entry)

        for cls_name, xmin, ymin, xmax, ymax in sample["objects"]:
            box_w = xmax - xmin
            box_h = ymax - ymin
            annotations.append({
                # Ids are consecutive, so the next id is the current count + 1.
                "id": len(annotations) + 1,
                "image_id": img_id,
                "category_id": cls2id[cls_name],
                # Box is stored as [x, y, width, height].
                "bbox": [xmin, ymin, box_w, box_h],
                "area": box_w * box_h,
                "iscrowd": 0,
            })

    # Categories are listed in ascending id order.
    categories = []
    for name in sorted(cls2id, key=cls2id.get):
        categories.append({"id": cls2id[name], "name": name, "supercategory": "object"})

    return {"images": images, "annotations": annotations, "categories": categories}


def dump_json(path: Path, data):
    """Write ``data`` to ``path`` as indented UTF-8 JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than escaped.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(payload)


def dump_lines(path: Path, lines):
    """Write ``lines`` to ``path`` as UTF-8 text, one entry per line.

    Missing parent directories are created first. The file always ends with
    a newline, so an empty ``lines`` produces a file holding one newline.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    content = "\n".join(lines) + "\n"
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)


def main():
    """Convert the XML annotations into COCO train/val/test splits.

    Reads every ``*.xml`` in ``SETTINGS["xml_dir"]``, pairs it with an image
    from ``SETTINGS["images_dir"]``, shuffles the valid samples with the
    configured seed and writes, below ``SETTINGS["output_dir"]``:

    * ``annotations/{train,val,test}.json``
    * ``splits/{train,val,test}.txt`` (absolute image paths)
    * ``ultralytics_data.yaml``
    * ``summary.json`` (also printed to stdout)

    Raises:
        ValueError: The three split ratios do not add up to 1.0.
        FileNotFoundError: No XML file was found.
        RuntimeError: No XML file produced a usable sample.
    """
    ratio_total = SETTINGS["train_ratio"] + SETTINGS["val_ratio"] + SETTINGS["test_ratio"]
    if abs(ratio_total - 1.0) > 1e-6:
        raise ValueError("train/val/test ratio sum must be 1.0")

    images_dir = Path(SETTINGS["images_dir"]).resolve()
    xml_dir = Path(SETTINGS["xml_dir"]).resolve()
    out_dir = Path(SETTINGS["output_dir"]).resolve()

    # Hidden files (names starting with a dot) are ignored.
    xml_files = sorted(p for p in xml_dir.glob("*.xml") if not p.name.startswith("."))
    if not xml_files:
        raise FileNotFoundError(f"No xml files found in {xml_dir}")

    samples, cls_names, skipped = [], set(), 0
    for xml_path in xml_files:
        parsed = parse_xml(xml_path)
        img_path = None
        if parsed is not None:
            filename, w, h, objs = parsed
            # Prefer the name recorded inside the XML; fall back to the
            # XML file's own stem when it is empty.
            stem = Path(filename).stem if filename else xml_path.stem
            img_path = find_img(images_dir, stem)
        if img_path is None:
            # Either the annotation was unusable or no image matched.
            skipped += 1
            continue

        for obj in objs:
            cls_names.add(obj[0])
        samples.append({
            "file_name": img_path.name,
            "width": w,
            "height": h,
            "objects": objs,
            "abs_path": img_path.as_posix(),
        })

    if not samples:
        raise RuntimeError("No valid samples generated")

    # Category ids are 1-based and follow the alphabetical class order.
    cls2id = {name: idx for idx, name in enumerate(sorted(cls_names), start=1)}

    rng = random.Random(SETTINGS["seed"])
    rng.shuffle(samples)
    n = len(samples)
    train_end = int(n * SETTINGS["train_ratio"])
    val_end = train_end + int(n * SETTINGS["val_ratio"])

    # The test split receives whatever is left after train and val.
    splits = {
        "train": samples[:train_end],
        "val": samples[train_end:val_end],
        "test": samples[val_end:],
    }

    ann_dir = out_dir / "annotations"
    split_dir = out_dir / "splits"

    for split_name, split_samples in splits.items():
        dump_json(ann_dir / f"{split_name}.json", to_coco(split_samples, cls2id))

    for split_name, split_samples in splits.items():
        dump_lines(split_dir / f"{split_name}.txt", [s["abs_path"] for s in split_samples])

    # cls2id was built in ascending id order, so its keys are already sorted
    # by id.
    ordered_names = list(cls2id)
    yaml_lines = [
        f"train: {(split_dir / 'train.txt').as_posix()}",
        f"val: {(split_dir / 'val.txt').as_posix()}",
        f"test: {(split_dir / 'test.txt').as_posix()}",
        f"nc: {len(cls2id)}",
        f"names: {ordered_names}",
    ]
    ultralytics_yaml = out_dir / "ultralytics_data.yaml"
    ultralytics_yaml.write_text("\n".join(yaml_lines) + "\n", encoding="utf-8")

    summary = {
        "images_total": n,
        "train": len(splits["train"]),
        "val": len(splits["val"]),
        "test": len(splits["test"]),
        "classes": sorted(cls_names),
        "num_classes": len(cls_names),
        "skipped_files": skipped,
        "output_dir": out_dir.as_posix(),
    }
    dump_json(out_dir / "summary.json", summary)
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()

创建自定义配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# 单目标 volleyball 实验配置:S 模型 + Objects365 预训练迁移
# 用法(在 D-FINE 目录下执行):
# python train.py -c ../config/volleyball_s_transfer.yml --use-amp --seed 0 -t weights/dfine_s_obj365.pth

__include__: [
'../D-FINE/configs/dfine/custom/objects365/dfine_hgnetv2_s_obj2custom.yml',
]

# 实验输出目录(相对 D-FINE 工作目录)
output_dir: ./output/exp_s_transfer_obj_aug

# 数据集设置:单类别,且不做 COCO80 类别重映射
num_classes: 1
remap_mscoco_category: False

# ============ 数据增强配置 ============
# 启用更激进的数据增强来改进小数据集性能
train_dataloader:
collate_fn:
# 多尺度训练
base_size: 640
base_size_repeat: 10
stop_epoch: 56
ema_restart_decay: 0.9999
type: BatchImageCollateFunction
dataset:
img_folder: C:/Users/Rick/Desktop/python/ObjectDetection/coco/images
ann_file: C:/Users/Rick/Desktop/python/ObjectDetection/coco/converted/annotations/train.json
return_masks: False
# 强化的数据增强管道
transforms:
type: Compose
ops:
# 1. 光度失真(亮度、对比度、饱和度)
- type: RandomPhotometricDistort
p: 0.7 # 70% 概率应用

# 2. 随机扩展(缩放扩展)
- type: RandomZoomOut
p: 0.2
fill: 0

# 3. 随机 IoU 裁切(对象中心裁切)
- type: RandomIoUCrop
p: 0.8 # 80% 概率应用,有助于小对象检测

# 4. 清理边界框
- type: SanitizeBoundingBoxes
min_size: 1

# 5. 随机水平翻转
- type: RandomHorizontalFlip
p: 0.5

# 6. 随机竖直翻转(排球检测中有用)
- type: RandomVerticalFlip
p: 0.3

# 7. 调整到固定尺寸
- type: Resize
size: [640, 640]

# 8. 再次清理边界框
- type: SanitizeBoundingBoxes
min_size: 1

# 9. 转换为张量
- type: ConvertPILImage
dtype: float32
scale: True

# 10. 转换边界框格式
- type: ConvertBoxes
fmt: cxcywh
normalize: True

policy:
name: stop_epoch
epoch: 56
ops: [RandomPhotometricDistort, RandomZoomOut, RandomIoUCrop]
type: CocoDetection
drop_last: True
num_workers: 4
shuffle: True
total_batch_size: 32
type: DataLoader

# NOTE: the training img_folder / ann_file are already set inside the
# `train_dataloader:` block above. A second top-level `train_dataloader:` key
# must not be declared here: the YAML loader keeps only the last duplicate
# key, which would silently discard the whole augmentation block above.

val_dataloader:
collate_fn:
type: BatchImageCollateFunction
dataset:
img_folder: C:/Users/Rick/Desktop/python/ObjectDetection/coco/images
ann_file: C:/Users/Rick/Desktop/python/ObjectDetection/coco/converted/annotations/val.json
return_masks: False
# 验证集使用最小增强
transforms:
type: Compose
ops:
# 只进行必需的变换,不做随机增强
- type: Resize
size: [640, 640]
- type: ConvertPILImage
dtype: float32
scale: True
- type: ConvertBoxes
fmt: cxcywh
normalize: True
type: CocoDetection
drop_last: False
num_workers: 4
shuffle: False
total_batch_size: 64
type: DataLoader

# 每 N 轮额外保存一次 checkpointxxxx.pth(last/best 仍会按训练逻辑保存)
checkpoint_freq: 6 # 从12改为6,保存更频繁

# 训练总轮次
epochs: 160 # 从120增加到160,更长的训练

# ========== 额外优化参数 ==========
# 学习率预热
lr_warmup_scheduler:
type: LinearWarmup
warmup_duration: 500 # 预热500步

# EMA模型平滑
ema:
type: ModelEMA
decay: 0.9999
start: 0
warmups: 2000

# 梯度裁减
clip_max_norm: 0.1

# 同步批归一化(多卡训练)
sync_bn: True

开始训练

1
2
3
4
5
6
7
8
cd D-FINE

python train.py -c ../config/volleyball_s_transfer.yml --use-amp --seed 0 -t weights/dfine_s_obj365.pth
python train.py -c ../config/volleyball_s_transfer.yml --use-amp --seed 0 -r output/exp_s_transfer/last.pth

+ -c 接config配置文件路径
+ -t 从预训练模型路径迁移tuning
+ -r 从last模型开始继续训练resume

分析和评估

分析

对于单目标排球训练,使用900张少量数据集迁移。

  • exp_s_transfer:obj365预训练模型迁移
  • exp_s_transfer_obj_aug:obj预训练模型+数据增强
  • exp_s_transfer_obj2coco_aug:obj2coco预训练模型+数据增强
1
2
3
4
5
6
7
8
9
10
11
12
`Test/coco_eval_bbox_0`(Precision:AP50:95-all)
`Test/coco_eval_bbox_1`(Precision:AP50)
`Test/coco_eval_bbox_2`(Precision:AP75)
`Test/coco_eval_bbox_3`(Precision:0.50:0.95-small)
`Test/coco_eval_bbox_4`(Precision:0.50:0.95-medium)
`Test/coco_eval_bbox_5`(Precision:0.50:0.95-large)
`Test/coco_eval_bbox_6`(Recall:0.50:0.95-all-maxDets=1)
`Test/coco_eval_bbox_7`(Recall:0.50:0.95-all-maxDets=10)
`Test/coco_eval_bbox_8`(Recall:0.50:0.95-all-maxDets=100)
`Test/coco_eval_bbox_9`(Recall:0.50:0.95-small-maxDets=100)
`Test/coco_eval_bbox_10`(Recall:0.50:0.95-medium-maxDets=100)
`Test/coco_eval_bbox_11`(Recall:0.50:0.95-large-maxDets=100)

收敛:蓝色曲线的 AP(coco_eval_bbox_0)在大约 60 个 Epoch 前迅速上升,随后进入平台期

红色曲线 exp_s_transfer_obj_aug 前期训练 loss 抖动,而且 AP/AR 无法快速上升(后续没啥问题,都能用),疑似数据集过小问题,停止训练。obj 数据集里确实有“排球”这个类别(Class 240),但是 COCO 的标注更为精准,且包含大量小目标。obj2coco 数据集的质量和多样性更好。(官方说 Objects365 预训练模型泛化性最好,其实差不多,感觉 Objects365+COCO 更好。)

无法收敛:橙黄色曲线 exp_s_transfer(obj365 预训练模型)在训练了 120 个 Epoch 后依旧无法收敛,表明 900+ 张的数据集过小。

小目标缺失: coco_eval_bbox_3/9(代表 Small 尺寸的 AP/AR)。表示数据集中没有配置好小目标。

Loss 曲线在现代优化器(如 AdamW + Cosine Annealing)的加持下,通常都会表现良好,一般数据集没有问题,不会出现问题。主要查看AP/AR曲线即可。

评估

主要查看COCO 评估指标能够准确判断模型情况。

D-FINE 默认提供了模型在不同 IoU 阈值和不同目标尺寸下的精度(Precision)和召回率(Recall)等参数。

  • IoU (Intersection over Union): 预测框与真实框的重叠度
  • AP (Average Precision): 精确率-召回率(P-R)曲线下的面积。
  • AR (Average Recall): 召回率,即模型找全目标的能力。
术语 全称 含义 预测情况
TP True Positive 真正例 模型说是正类,实际也是正类
TN True Negative 真负例 模型说是负类,实际也是负类
FP False Positive 假正例 模型说是正类,实际却是负类
FN False Negative 假负例 模型说是负类,实际却是正类

精确率 (Precision)

定义: 在模型所有预测为正类的样本中,真正是对的占比

$$Precision = \frac{TP}{TP + FP}$$

高精确率 (High Precision)表示模型检测到10个目标,有9个是正确的,检测精准正确。可能在很有把握才识别,容易漏识别,即低Recall

召回率 (Recall)

定义:所有实际为正类的样本中,模型成功找出的占比

$$Recall = \frac{TP}{TP + FN}$$

高召回率 (High Recall)表示模型在10个真正目标中,检测到9个,很少漏检,可能过于自信,检测到假目标,即低Precision

精确率和召回率往往是此消彼长的(Trade-off),可以使用F1-Score 来平衡这两者

主要查看指标

AP @[ IoU=0.50:0.95 ] :单个类别的平均精度,这是最重要的综合指标。它计算了 IoU 从 0.5 到 0.95(步长 0.05)的平均值:0.5:0.05:0.95

mAP @[ IoU=0.50:0.95 ] :多个类别平均精度的平均值,模型检测各个class的AP的平均。

AP @[ IoU=0.50 / 0.75 ]:固定 IoU 阈值下的 AP。AP50 只要预测框和真实框的 IoU > 0.5 就认为它是 TP,较为宽容;AP75 则更为严格,要求模型的预测框必须与真实框高度重合(IoU > 0.75)。

area=small/medium/large 测试集中目标大小得分

可视化

使用fiftyone可视化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import json
import sys
from collections import defaultdict
from pathlib import Path

import fiftyone as fo
import fiftyone.core.labels as fol
import torch
import torchvision.transforms as T
from PIL import Image
from tqdm import tqdm


# ==========================
# Hardcoded runtime settings
# ==========================
# Edit the values here; no command-line arguments are needed.
SETTINGS = {
    "repo_dir": "./D-FINE",
    "config": "./config/volleyball_s_transfer.yml",
    "checkpoint": "./D-FINE/output/exp_s_transfer_obj2coco_aug/best_stg1.pth",  # latest best weights
    "img_root": "./coco/images",
    "ann_file": "./coco/converted/annotations/val.json",
    "dataset_name": "volleyball-val-s-b1",
    "eval_key": "eval",
    "device": "cuda:0",
    "input_size": 640,
    "conf_thres": 0.25,
    "limit": 0,  # 0 = all
    "overwrite": True,
    "no_app": False,
}


def load_dfine_model(repo_dir: Path, config_path: str, checkpoint_path: str, device: str):
    """Build the model and postprocessor and load checkpoint weights.

    Args:
        repo_dir: Checkout that provides ``src.core``; it is put at the
            front of ``sys.path``.
        config_path: YAML config of the experiment.
        checkpoint_path: Checkpoint file to load.
        device: Device both returned modules are moved to.

    Returns:
        ``(model, postprocessor)``, both in deploy form and eval mode.
    """
    sys.path.insert(0, str(repo_dir.resolve()))
    from src.core import YAMLConfig

    original_config = Path(config_path)
    safe_config_path = prepare_windows_readable_config(original_config)
    try:
        cfg = YAMLConfig(str(safe_config_path), resume=checkpoint_path)
    finally:
        # The temporary copy is only needed while the config is parsed.
        cleanup_temp_config(safe_config_path, original_config)

    if "HGNetv2" in cfg.yaml_cfg:
        cfg.yaml_cfg["HGNetv2"]["pretrained"] = False

    try:
        checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    except Exception:
        # NOTE(review): any failure of the restricted load (including a
        # torch build without ``weights_only``) falls back to a full
        # unpickling load. Only use checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # Use the EMA weights when the checkpoint carries them.
    if "ema" in checkpoint:
        state = checkpoint["ema"]["module"]
    else:
        state = checkpoint["model"]
    cfg.model.load_state_dict(state, strict=False)

    model = cfg.model.deploy().to(device).eval()
    postprocessor = cfg.postprocessor.deploy().to(device).eval()
    return model, postprocessor


def prepare_windows_readable_config(config_path: Path) -> Path:
    """Write an ASCII-only sibling copy of a config file and return its path.

    The file is decoded by trying ``utf-8-sig``, ``utf-8`` and ``gbk`` in
    turn. Every non-ASCII character is then dropped, so the copy reads the
    same under a UTF-8 or a GBK default decoder. The copy is named
    ``.fo_tmp_<stem>.yml`` and lives next to the original.

    Raises:
        FileNotFoundError: ``config_path`` does not exist.
        RuntimeError: None of the candidate encodings could decode the file.
    """
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    for enc in ("utf-8-sig", "utf-8", "gbk"):
        try:
            text = config_path.read_text(encoding=enc)
        except UnicodeDecodeError:
            continue
        break
    else:
        raise RuntimeError(f"Cannot decode config file: {config_path}")

    # Keep only ASCII so the result decodes identically everywhere.
    safe_text = "".join(ch for ch in text if ord(ch) < 128)
    tmp_path = config_path.with_name(f".fo_tmp_{config_path.stem}.yml")
    tmp_path.write_text(safe_text, encoding="utf-8")
    return tmp_path


def cleanup_temp_config(tmp_path: Path, original_config_path: Path):
    """Delete the temporary config copy on a best-effort basis.

    Nothing is removed when ``tmp_path`` resolves to the original config
    itself. Any error raised while checking for or deleting the file is
    ignored.
    """
    same_file = tmp_path.resolve() == original_config_path.resolve()
    if same_file:
        # Never delete the user's real config.
        return

    try:
        if not tmp_path.exists():
            return
        tmp_path.unlink()
    except Exception:
        # Cleanup is optional; a leftover temp file is harmless.
        pass


def load_coco(ann_file: Path):
    """Load a COCO annotation file.

    Args:
        ann_file: UTF-8 JSON file with ``images``, ``annotations`` and
            ``categories`` entries.

    Returns:
        ``(images, anns_by_image, cat_id_to_name)``: the image list as
        stored, a ``defaultdict`` grouping annotations by ``image_id``, and
        a dict mapping category id to category name.
    """
    payload = json.loads(ann_file.read_text(encoding="utf-8"))
    images, annotations, categories = (
        payload["images"],
        payload["annotations"],
        payload["categories"],
    )

    anns_by_image = defaultdict(list)
    for record in annotations:
        owner = record["image_id"]
        anns_by_image[owner].append(record)

    cat_id_to_name = {}
    for category in categories:
        cat_id_to_name[category["id"]] = category["name"]

    return images, anns_by_image, cat_id_to_name


def clamp01(x):
    """Limit ``x`` to the closed interval [0.0, 1.0]."""
    capped = min(1.0, x)
    return max(0.0, capped)


def coco_bbox_to_fo_detection(bbox_xywh, w, h, label, confidence=None):
    """Convert an absolute ``[x, y, width, height]`` box to a detection.

    The box is normalised by the image size and clipped to the image. The
    clipping is done on the corners, so a box that crosses the image border
    is shortened rather than left extending past 1.0; this matches how
    ``xyxy_abs_to_fo_detection`` treats predictions.

    Args:
        bbox_xywh: Box as ``(x, y, width, height)`` in pixels.
        w: Image width in pixels.
        h: Image height in pixels.
        label: Class label stored on the detection.
        confidence: Optional score; stored as a float when given.

    Returns:
        A ``fol.Detection`` whose ``bounding_box`` is relative to the image.
    """
    x, y, bw, bh = bbox_xywh
    x1 = clamp01(x / w)
    y1 = clamp01(y / h)
    x2 = clamp01((x + bw) / w)
    y2 = clamp01((y + bh) / h)
    # A negative extent collapses to 0 instead of producing a negative size.
    nw = clamp01(x2 - x1)
    nh = clamp01(y2 - y1)

    kwargs = dict(label=label, bounding_box=[x1, y1, nw, nh])
    if confidence is not None:
        kwargs["confidence"] = float(confidence)
    return fol.Detection(**kwargs)


def xyxy_abs_to_fo_detection(box_xyxy, w, h, label, confidence):
    """Convert an absolute ``(x1, y1, x2, y2)`` box to a detection.

    Each corner is divided by the image size and clipped to [0, 1]; the
    stored box is ``[left, top, width, height]`` with the extents clipped
    to [0, 1] as well.
    """
    left, top, right, bottom = box_xyxy
    left = clamp01(float(left) / w)
    top = clamp01(float(top) / h)
    right = clamp01(float(right) / w)
    bottom = clamp01(float(bottom) / h)

    relative_box = [left, top, clamp01(right - left), clamp01(bottom - top)]
    return fol.Detection(
        label=label,
        bounding_box=relative_box,
        confidence=float(confidence),
    )


def unpack_predictions(pred_output):
    """Return ``(labels, boxes, scores)`` for the first sample of a batch.

    Two layouts are accepted:

    1. A tuple or list of exactly three items whose first item is a tensor,
       read as ``(labels, boxes, scores)``; index 0 of each is returned.
    2. A non-empty list of dicts; the ``labels``, ``boxes`` and ``scores``
       entries of the first dict are returned.

    Raises:
        TypeError: ``pred_output`` matches neither layout.
    """
    # Layout 1: three batched tensors.
    is_sequence = isinstance(pred_output, (tuple, list))
    if is_sequence and len(pred_output) == 3 and torch.is_tensor(pred_output[0]):
        labels_b, boxes_b, scores_b = pred_output
        return tuple(batched[0] for batched in (labels_b, boxes_b, scores_b))

    # Layout 2: one dict per sample.
    if isinstance(pred_output, list) and pred_output and isinstance(pred_output[0], dict):
        head = pred_output[0]
        return tuple(head[key] for key in ("labels", "boxes", "scores"))

    raise TypeError(f"Unsupported prediction output type: {type(pred_output)}")


@torch.no_grad()
def run():
    """Run the model over a COCO split and inspect the result in FiftyOne.

    Steps, all driven by ``SETTINGS``:

    1. Replace (or refuse to replace) an existing dataset of the same name.
    2. Load the annotations and optionally keep only the first ``limit``
       images.
    3. For every image that exists on disk, store the ground-truth boxes
       and the predictions whose score reaches ``conf_thres``.
    4. Evaluate the predictions against the ground truth and print a short
       report.
    5. Launch the FiftyOne app unless ``no_app`` is set.

    Raises:
        RuntimeError: The dataset already exists and ``overwrite`` is false.
    """
    cfg = SETTINGS
    repo_dir = Path(cfg["repo_dir"])
    img_root = Path(cfg["img_root"])
    ann_file = Path(cfg["ann_file"])

    if fo.dataset_exists(cfg["dataset_name"]):
        if cfg["overwrite"]:
            fo.delete_dataset(cfg["dataset_name"])
        else:
            raise RuntimeError(
                f"Dataset '{cfg['dataset_name']}' already exists. Set SETTINGS['overwrite']=True to replace it."
            )

    images, anns_by_image, cat_id_to_name = load_coco(ann_file)
    if cfg["limit"] and cfg["limit"] > 0:
        images = images[: cfg["limit"]]

    model, postprocessor = load_dfine_model(
        repo_dir, cfg["config"], cfg["checkpoint"], cfg["device"]
    )

    tfm = T.Compose(
        [
            T.Resize((cfg["input_size"], cfg["input_size"])),
            T.ToTensor(),
        ]
    )

    dataset = fo.Dataset(cfg["dataset_name"])
    missing_images = 0

    for img_info in tqdm(images, desc="Building FiftyOne dataset"):
        image_id = img_info["id"]
        file_name = img_info["file_name"]
        width = img_info["width"]
        height = img_info["height"]
        image_path = img_root / file_name

        if not image_path.exists():
            # Skip missing files instead of hard failing, but keep count so
            # the omission shows up in the report.
            missing_images += 1
            continue

        sample = fo.Sample(filepath=str(image_path.resolve()))

        # Ground truth
        gt_dets = []
        for ann in anns_by_image.get(image_id, []):
            cat_id = ann["category_id"]
            label = cat_id_to_name.get(cat_id, str(cat_id))
            gt_dets.append(coco_bbox_to_fo_detection(ann["bbox"], width, height, label))
        sample["ground_truth"] = fol.Detections(detections=gt_dets)

        # Prediction. The context manager closes the image file once the
        # pixels have been converted; otherwise one handle stays open per
        # image for the duration of the loop.
        with Image.open(image_path) as raw_image:
            tensor = tfm(raw_image.convert("RGB")).unsqueeze(0).to(cfg["device"])
        orig_target_sizes = torch.tensor(
            [[width, height]], dtype=torch.float32, device=cfg["device"]
        )

        outputs = model(tensor)
        pred_output = postprocessor(outputs, orig_target_sizes)
        labels_t, boxes_t, scores_t = unpack_predictions(pred_output)

        labels = labels_t.detach().cpu().tolist()
        boxes = boxes_t.detach().cpu().tolist()
        scores = scores_t.detach().cpu().tolist()

        pred_dets = []
        for label_id, box, score in zip(labels, boxes, scores):
            if score < cfg["conf_thres"]:
                continue

            # Handle both 0-based and 1-based label id conventions
            if label_id in cat_id_to_name:
                label_name = cat_id_to_name[label_id]
            elif (label_id + 1) in cat_id_to_name:
                label_name = cat_id_to_name[label_id + 1]
            else:
                label_name = str(label_id)

            pred_dets.append(xyxy_abs_to_fo_detection(box, width, height, label_name, score))

        sample["predictions"] = fol.Detections(detections=pred_dets)
        dataset.add_sample(sample)

    results = dataset.evaluate_detections(
        "predictions",
        gt_field="ground_truth",
        eval_key=cfg["eval_key"],
        compute_mAP=True,
    )

    try:
        print(f"mAP: {results.mAP():.6f}")
    except Exception:
        print("mAP: unavailable")
    print(f"Dataset: {dataset.name}")
    print(f"Samples: {len(dataset)}")
    if missing_images:
        print(f"Skipped (image file missing): {missing_images}")
    print("Use FiftyOne sidebar to inspect FP/FN and per-sample errors.")

    if not cfg["no_app"]:
        session = fo.launch_app(dataset)
        session.wait()


if __name__ == "__main__":
    run()

手工标注的数据集与实际目标边缘存在较大误差,实测模型识别得更为精准。可以更新数据集,再次训练使得AP继续上升。

可以将图片添加为负样本继续微调。

负样本微调

咕咕咕。。。

部署

咕咕咕。。。