An attempt at fine-tuning Depth Anything V2 for metric (absolute) depth on a custom dataset

The fine-tuning follows the official metric_depth documentation.

metric_depth/train.py

import argparse
import logging
import os
import pprint
import random

import warnings
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from dataset.hypersim import Hypersim
from dataset.kitti import KITTI
from dataset.vkitti2 import VKITTI2
from depth_anything_v2.dpt import DepthAnythingV2
from util.dist_helper import setup_distributed
from util.loss import SiLogLoss
from util.metric import eval_depth
from util.utils import init_log


parser = argparse.ArgumentParser(description='Depth Anything V2 for Metric Depth Estimation')

parser.add_argument('--encoder', default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
parser.add_argument('--dataset', default='hypersim', choices=['hypersim', 'vkitti'])
parser.add_argument('--img-size', default=518, type=int)
parser.add_argument('--min-depth', default=0.001, type=float)
parser.add_argument('--max-depth', default=20, type=float)
parser.add_argument('--epochs', default=100, type=int)
parser.add_argument('--bs', default=1, type=int)
parser.add_argument('--lr', default=0.000005, type=float)
parser.add_argument('--pretrained-from', default='./checkpoints/depth_anything_v2_vitl.pth', type=str)
parser.add_argument('--save-path', type=str, required=True)
parser.add_argument('--local-rank', default=0, type=int)
parser.add_argument('--port', default=None, type=int)


def main():
    args = parser.parse_args()
    
    warnings.simplefilter('ignore', np.RankWarning)
    
    logger = init_log('global', logging.INFO)
    logger.propagate = 0
    
    rank, world_size = setup_distributed(port=args.port)
    
    if rank == 0:
        all_args = {**vars(args), 'ngpus': world_size}
        logger.info('{}\n'.format(pprint.pformat(all_args)))
        writer = SummaryWriter(args.save_path)
    
    cudnn.enabled = True
    cudnn.benchmark = True
    
    size = (args.img_size, args.img_size)
    if args.dataset == 'hypersim':
        trainset = Hypersim('dataset/splits/hypersim/train.txt', 'train', size=size)
    elif args.dataset == 'vkitti':
        trainset = KITTI('dataset/splits/vkitti2/train.txt', 'train', size=size)  # custom split, loaded through the KITTI-format dataset class
    else:
        raise NotImplementedError
    trainsampler = torch.utils.data.distributed.DistributedSampler(trainset)
    trainloader = DataLoader(trainset, batch_size=args.bs, pin_memory=True, num_workers=4, drop_last=True, sampler=trainsampler)
    
    if args.dataset == 'hypersim':
        valset = Hypersim('dataset/splits/hypersim/val.txt', 'val', size=size)
    elif args.dataset == 'vkitti':
        valset = KITTI('dataset/splits/kitti/val.txt', 'train', size=size)  # 'train' mode also resizes the GT depth; the official script passes 'val' here
    else:
        raise NotImplementedError
    valsampler = torch.utils.data.distributed.DistributedSampler(valset)
    valloader = DataLoader(valset, batch_size=1, pin_memory=True, num_workers=4, drop_last=True, sampler=valsampler)
    
    local_rank = int(os.environ["LOCAL_RANK"])
    
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }
    model = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    
    if args.pretrained_from:
        # only the keys containing 'pretrained' (the DINOv2 encoder) are loaded from the
        # official checkpoint; the DPT head is trained from scratch
        model.load_state_dict({k: v for k, v in torch.load(args.pretrained_from, map_location='cpu').items() if 'pretrained' in k}, strict=False)
    
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], broadcast_buffers=False,
                                                      output_device=local_rank, find_unused_parameters=True)
    
    criterion = SiLogLoss().cuda(local_rank)
    
    optimizer = AdamW([{'params': [param for name, param in model.named_parameters() if 'pretrained' in name], 'lr': args.lr},
                       {'params': [param for name, param in model.named_parameters() if 'pretrained' not in name], 'lr': args.lr * 10.0}],
                      lr=args.lr, betas=(0.9, 0.999), weight_decay=0.01)
    
    total_iters = args.epochs * len(trainloader)
    
    previous_best = {'d1': 0, 'd2': 0, 'd3': 0, 'abs_rel': 100, 'sq_rel': 100, 'rmse': 100, 'rmse_log': 100, 'log10': 100, 'silog': 100}
    
    for epoch in range(args.epochs):
        if rank == 0:
            logger.info('===========> Epoch: {:}/{:}, d1: {:.3f}, d2: {:.3f}, d3: {:.3f}'.format(epoch, args.epochs, previous_best['d1'], previous_best['d2'], previous_best['d3']))
            logger.info('===========> Epoch: {:}/{:}, abs_rel: {:.3f}, sq_rel: {:.3f}, rmse: {:.3f}, rmse_log: {:.3f}, '
                        'log10: {:.3f}, silog: {:.3f}'.format(
                            epoch, args.epochs, previous_best['abs_rel'], previous_best['sq_rel'], previous_best['rmse'], 
                            previous_best['rmse_log'], previous_best['log10'], previous_best['silog']))
        
        trainloader.sampler.set_epoch(epoch + 1)
        
        model.train()
        total_loss = 0
        
        for i, sample in enumerate(trainloader):
            optimizer.zero_grad()
            
            img, depth, valid_mask = sample['image'].cuda(), sample['depth'].cuda(), sample['valid_mask'].cuda()
            print(f"[DEBUG] img shape: {img.shape}")
            print(f"[DEBUG] depth shape: {depth.shape}")
            print(f"[DEBUG] valid_mask shape: {valid_mask.shape}")

            if random.random() < 0.5:
                img = img.flip(-1)
                depth = depth.flip(-1)
                valid_mask = valid_mask.flip(-1)
            
            pred = model(img)
            print(f"[DEBUG] pred shape: {pred.shape}")
            loss = criterion(pred, depth, (valid_mask == 1) & (depth >= args.min_depth) & (depth <= args.max_depth))

            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            iters = epoch * len(trainloader) + i
            
            lr = args.lr * (1 - iters / total_iters) ** 0.9
            
            optimizer.param_groups[0]["lr"] = lr
            optimizer.param_groups[1]["lr"] = lr * 10.0
            
            if rank == 0:
                writer.add_scalar('train/loss', loss.item(), iters)
            
            if rank == 0 and i % 100 == 0:
                logger.info('Iter: {}/{}, LR: {:.7f}, Loss: {:.3f}'.format(i, len(trainloader), optimizer.param_groups[0]['lr'], loss.item()))
        
        model.eval()
        
        results = {'d1': torch.tensor([0.0]).cuda(), 'd2': torch.tensor([0.0]).cuda(), 'd3': torch.tensor([0.0]).cuda(), 
                   'abs_rel': torch.tensor([0.0]).cuda(), 'sq_rel': torch.tensor([0.0]).cuda(), 'rmse': torch.tensor([0.0]).cuda(), 
                   'rmse_log': torch.tensor([0.0]).cuda(), 'log10': torch.tensor([0.0]).cuda(), 'silog': torch.tensor([0.0]).cuda()}
        nsamples = torch.tensor([0.0]).cuda()
        
        for i, sample in enumerate(valloader):
            
            img, depth, valid_mask = sample['image'].cuda().float(), sample['depth'].cuda()[0], sample['valid_mask'].cuda()[0]
            
            with torch.no_grad():
                pred = model(img)
                print(f"[DEBUG] val pred shape: {pred.shape}")
                pred = F.interpolate(pred[:, None], depth.shape[-2:], mode='bilinear', align_corners=True)[0, 0]

            valid_mask = (valid_mask == 1) & (depth >= args.min_depth) & (depth <= args.max_depth)
            print(f"[DEBUG] val depth shape: {depth.shape}")
            print(f"[DEBUG] val valid_mask shape: {valid_mask.shape}")
            if valid_mask.sum() < 10:
                continue
            
            cur_results = eval_depth(pred[valid_mask], depth[valid_mask])
            
            for k in results.keys():
                results[k] += cur_results[k]
            nsamples += 1
        
        torch.distributed.barrier()
        
        for k in results.keys():
            dist.reduce(results[k], dst=0)
        dist.reduce(nsamples, dst=0)
        
        if rank == 0:
            logger.info('==========================================================================================')
            logger.info('{:>8}, {:>8}, {:>8}, {:>8}, {:>8}, {:>8}, {:>8}, {:>8}, {:>8}'.format(*tuple(results.keys())))
            logger.info('{:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}, {:8.3f}'.format(*tuple([(v / nsamples).item() for v in results.values()])))
            logger.info('==========================================================================================')
            print()
            
            for name, metric in results.items():
                writer.add_scalar(f'eval/{name}', (metric / nsamples).item(), epoch)
        
        for k in results.keys():
            if k in ['d1', 'd2', 'd3']:
                previous_best[k] = max(previous_best[k], (results[k] / nsamples).item())
            else:
                previous_best[k] = min(previous_best[k], (results[k] / nsamples).item())
        
        if rank == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'previous_best': previous_best,
            }
            torch.save(checkpoint, os.path.join(args.save_path, 'latest.pth'))


if __name__ == '__main__':
    main()

metric_depth/dataset/kitti.py

import cv2
import torch
from torch.utils.data import Dataset
from torchvision.transforms import Compose

from dataset.transform import Resize, NormalizeImage, PrepareForNet


class KITTI(Dataset):
    def __init__(self, filelist_path, mode, size=(518, 518)):
        # The official loader only supports mode == 'val'; the check is commented out
        # here so the same class can also load the custom training split.
        # if mode != 'val':
        #     raise NotImplementedError
        
        self.mode = mode
        self.size = size
        
        with open(filelist_path, 'r') as f:
            self.filelist = f.read().splitlines()
        
        net_w, net_h = size
        self.transform = Compose([
            Resize(
                width=net_w,
                height=net_h,
                resize_target=True if mode == 'train' else False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])
    
    def __getitem__(self, item):
        img_path = self.filelist[item].split(' ')[0]
        depth_path = self.filelist[item].split(' ')[1]
        
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        
        # NOTE: flag 0 reads the depth map as 8-bit grayscale; for 16-bit depth PNGs
        # (KITTI convention), cv2.IMREAD_UNCHANGED as in the official loader keeps full precision
        depth = cv2.imread(depth_path, 0).astype('float32')
        
        sample = self.transform({'image': image, 'depth': depth})
        
        sample['image'] = torch.from_numpy(sample['image'])
        sample['depth'] = torch.from_numpy(sample['depth'])
        sample['depth'] = sample['depth'] / 256.0  # convert in meters
        
        sample['valid_mask'] = sample['depth'] > 0
        
        sample['image_path'] = self.filelist[item].split(' ')[0]
        
        return sample

    def __len__(self):
        return len(self.filelist)

The dataset is just ordinary color images paired with single-channel depth maps. List your own samples in train.txt and val.txt, one line per sample in the form "color-image path depth-map path", and place the two files under metric_depth/dataset/splits/vkitti2 and metric_depth/dataset/splits/kitti respectively, as shown in the figure. A small sketch for generating such a split file is given below.
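A minimal sketch for generating the split file, assuming a hypothetical layout with matching file names under my_data/rgb and my_data/depth (both directory names are placeholders):

import os

# hypothetical layout: my_data/rgb/xxx.png and my_data/depth/xxx.png with matching names
rgb_dir = 'my_data/rgb'
depth_dir = 'my_data/depth'

with open('dataset/splits/vkitti2/train.txt', 'w') as f:
    for name in sorted(os.listdir(rgb_dir)):
        depth_path = os.path.join(depth_dir, name)
        if os.path.exists(depth_path):
            # one sample per line: "<color-image path> <depth-map path>"
            f.write(f'{os.path.join(rgb_dir, name)} {depth_path}\n')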

The training command is:

RANK=0 WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=localhost MASTER_PORT=10002 python train.py --save-path ./checkpoints --dataset vkitti

After training, the test command is:

python run.py --img-path /home/liu/Depth-Anything-V2-main/metric_depth/test/rgb --encoder vitl  --load-from checkpoints/latest.pth
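Note that train.py saves latest.pth as a dict ({'model': ..., 'optimizer': ..., 'epoch': ..., 'previous_best': ...}) in which the model weights carry the DDP 'module.' prefix. If the inference script expects a plain state dict, the checkpoint has to be unwrapped first; a minimal sketch (the vitl configuration and max_depth=20 mirror the training arguments above):

import torch

from depth_anything_v2.dpt import DepthAnythingV2

# same vitl configuration and max_depth as used for training above
model = DepthAnythingV2(encoder='vitl', features=256,
                        out_channels=[256, 512, 1024, 1024], max_depth=20)

ckpt = torch.load('checkpoints/latest.pth', map_location='cpu')
# the weights were saved from a DistributedDataParallel model, so strip the 'module.' prefix
state_dict = {k.replace('module.', '', 1): v for k, v in ckpt['model'].items()}
model.load_state_dict(state_dict)
model.eval()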

After training, my predicted depth maps showed checkerboard artifacts. According to the references I found, this is caused by the ConvTranspose2d upsampling layers, so I replaced them in metric_depth/depth_anything_v2/dpt.py. Since train.py only loads the checkpoint keys containing 'pretrained' (the DINOv2 encoder), the DPT head is trained from scratch anyway, so swapping its upsampling layers does not conflict with the pretrained weights.
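For comparison, the first two entries of resize_layers in the official dpt.py are transposed convolutions along these lines (paraphrased), which are the usual cause of such checkerboard patterns:

self.resize_layers = nn.ModuleList([
    nn.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0),
    nn.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0),
    nn.Identity(),
    nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
])

The modified DPTHead replaces the two transposed convolutions with bilinear upsampling followed by a 3x3 convolution: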

class DPTHead(nn.Module):
    def __init__(
        self, 
        in_channels, 
        features=256, 
        use_bn=False, 
        out_channels=[256, 512, 1024, 1024], 
        use_clstoken=False
    ):
        super(DPTHead, self).__init__()
        
        self.use_clstoken = use_clstoken
        
        self.projects = nn.ModuleList([
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channel,
                kernel_size=1,
                stride=1,
                padding=0,
            ) for out_channel in out_channels
        ])
        
        self.resize_layers = nn.ModuleList([
            # replace the first transposed convolution (4x upsampling)
            nn.Sequential(
                nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True),
                nn.Conv2d(out_channels[0], out_channels[0], kernel_size=3, padding=1)
            ),
            # replace the second transposed convolution (2x upsampling)
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
                nn.Conv2d(out_channels[1], out_channels[1], kernel_size=3, padding=1)
            ),
            nn.Identity(),
            nn.Conv2d(
                in_channels=out_channels[3],
                out_channels=out_channels[3],
                kernel_size=3,
                stride=2,
                padding=1)
        ])
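A quick standalone check (with a hypothetical channel count of 256 and a 37x37 feature map) shows that the upsample-plus-conv replacement keeps the channel count and reproduces the 4x spatial scaling of the original transposed convolution:

import torch
import torch.nn as nn

up4 = nn.Sequential(
    nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True),
    nn.Conv2d(256, 256, kernel_size=3, padding=1),
)
x = torch.randn(1, 256, 37, 37)
print(up4(x).shape)  # torch.Size([1, 256, 148, 148]) -- 4x larger spatially, same channels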

The final result is shown in the figure below.

The relative depth map produced by the model before fine-tuning:

The ground-truth depth map:

This was only a quick attempt and I am not sure whether every step is correct, so please treat it as a reference only.
