
In `torch::Tensor input_tensor = torch::from_blob(input_data, {batch_size, channels, height, width}, torch::kFloat32);`, what does `torch::kFloat32` mean?

In the PyTorch C++ API (LibTorch), `torch::kFloat32` is an enum value (an alias for `torch::ScalarType::Float`) that explicitly sets a tensor's data type to a **32-bit single-precision float**. Step by step:

1. **Basic definition**
   `torch::kFloat32` corresponds to the native C++ `float` type; each element occupies 4 bytes. The normalized value range is approximately:
   $$ \pm 1.18 \times 10^{-38} \ \text{to} \ \pm 3.4 \times 10^{38} $$

2. **Why specify the data type?**
   - **Memory interpretation**: it tells PyTorch how to parse the raw memory behind `input_data` (for example, a `float*` array)
   - **Compute compatibility**: network layers (such as `Linear`/`Conv2d`) generally require inputs of a specific dtype
   - **Hardware acceleration**: GPUs/TPUs have native optimizations for float32

3. **Common dtype mapping**

   | PyTorch C++ API   | Python API      | Native C++ type | Typical use              |
   |-------------------|-----------------|-----------------|--------------------------|
   | `torch::kFloat32` | `torch.float32` | `float`         | Deep-learning default    |
   | `torch::kFloat16` | `torch.float16` | -               | Mixed-precision training |
   | `torch::kInt64`   | `torch.int64`   | `int64_t`       | Indices / class labels   |

4. **Your code, annotated**
   ```cpp
   torch::Tensor input_tensor = torch::from_blob(
       input_data,                             // pointer to the raw data
       {batch_size, channels, height, width},  // tensor shape
       torch::kFloat32                         // interpret each element as float32
   );
   ```
   - If `input_data` is a `float*`, it matches `torch::kFloat32`
   - If `input_data` is a `double*`, use `torch::kFloat64` instead

5. **Verifying the dtype**
   After creating the tensor, you can check:
   ```cpp
   std::cout << input_tensor.dtype();  // prints "float" for torch::kFloat32
   ```

**Caveats**
- A mismatched dtype makes LibTorch misparse the memory (e.g., declaring a `uint8_t*` buffer as `torch::kFloat32` yields meaningless values)
- At inference time, the input dtype must match the dtype the model was trained with
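Where the first caveat bites in practice: `from_blob` never converts, it only reinterprets the bytes it is given. A minimal sketch of the safe pattern (the `image_data` buffer and its 1×3×4×4 shape are made up for illustration): wrap the buffer with the dtype it actually has in memory, then convert explicitly with `.to()`.

```cpp
#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical raw buffer, e.g. an image decoded as 8-bit RGB.
    std::vector<uint8_t> image_data(1 * 3 * 4 * 4, 255);

    // Wrap without copying; the dtype must describe the buffer as it is
    // laid out in memory (kUInt8 here), NOT the dtype you eventually want.
    torch::Tensor raw = torch::from_blob(image_data.data(), {1, 3, 4, 4}, torch::kUInt8);

    // Convert explicitly (this allocates and copies) and scale for a model.
    torch::Tensor input = raw.to(torch::kFloat32) / 255.0;

    std::cout << input.dtype() << "\n";               // float
    std::cout << input.mean().item<float>() << "\n";  // 1.0 for an all-255 buffer
    return 0;
}
```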
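One more caveat worth pairing with the dtype rules: `from_blob` does not copy or take ownership of the buffer, so the buffer must outlive the tensor. A sketch of the usual fix, `clone()` (the helper name and parameters below are illustrative, not a LibTorch API):

```cpp
#include <torch/torch.h>

// Illustrative helper: wrap a caller-owned float buffer, then clone()
// so the returned tensor owns its storage and the buffer may be freed.
torch::Tensor make_owning_tensor(const float* input_data,
                                 int64_t batch_size, int64_t channels,
                                 int64_t height, int64_t width) {
    torch::Tensor view = torch::from_blob(
        const_cast<float*>(input_data),  // from_blob takes a non-const void*
        {batch_size, channels, height, width},
        torch::kFloat32);
    return view.clone();  // deep copy; safe to use after input_data is released
}
```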

相关推荐

import torch.nn as nn import math import torch import torch.nn as nn import torch.nn as nn import torch import torch.nn.functional as F import numpy as np import math import numpy as np from typing import Any, Callable import torch from torch import nn, Tensor from typing import List, Optional import math from ultralytics.nn.modules.conv import Conv from typing import Union var: Union[int, tuple] = 1 # build RepVGG block # ----------------------------- def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): result = nn.Sequential() result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False)) result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) return result class SEBlock(nn.Module): def __init__(self, input_channels): super(SEBlock, self).__init__() internal_neurons = input_channels // 8 self.down = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons, kernel_size=1, stride=1, bias=True) self.up = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels, kernel_size=1, stride=1, bias=True) self.input_channels = input_channels def forward(self, inputs): x = F.avg_pool2d(inputs, kernel_size=inputs.size(3)) x = self.down(x) x = F.relu(x) x = self.up(x) x = torch.sigmoid(x) x = x.view(-1, self.input_channels, 1, 1) return inputs * x class RepVGG(nn.Module): def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, padding_mode='zeros', deploy=False, use_se=False): super(RepVGG, self).__init__() self.deploy = deploy self.groups = groups self.in_channels = in_channels padding_11 = padding - kernel_size // 2 self.nonlinearity = nn.SiLU() # self.nonlinearity = nn.ReLU() if use_se: self.se = SEBlock(out_channels, internal_neurons=out_channels // 16) else: self.se = nn.Identity() if deploy: self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=True, padding_mode=padding_mode) else: self.rbr_identity = nn.BatchNorm2d( num_features=in_channels) if out_channels == in_channels and stride == 1 else None self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups) self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding_11, groups=groups) # print('RepVGG Block, identity = ', self.rbr_identity) def get_equivalent_kernel_bias(self): kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) def _fuse_bn_tensor(self, branch): if branch is None: return 0, 0 if isinstance(branch, nn.Sequential): kernel = branch.conv.weight running_mean = branch.bn.running_mean running_var = branch.bn.running_var gamma = branch.bn.weight beta = branch.bn.bias eps = branch.bn.eps else: assert isinstance(branch, nn.BatchNorm2d) if not hasattr(self, 'id_tensor'): input_dim = self.in_channels // self.groups kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32) for i in 
range(self.in_channels): kernel_value[i, i % input_dim, 1, 1] = 1 self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) kernel = self.id_tensor running_mean = branch.running_mean running_var = branch.running_var gamma = branch.weight beta = branch.bias eps = branch.eps std = (running_var + eps).sqrt() t = (gamma / std).reshape(-1, 1, 1, 1) return kernel * t, beta - running_mean * gamma / std def forward(self, inputs): if hasattr(self, 'rbr_reparam'): return self.nonlinearity(self.se(self.rbr_reparam(inputs))) if self.rbr_identity is None: id_out = 0 else: id_out = self.rbr_identity(inputs) return self.nonlinearity(self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) def fusevggforward(self, x): return self.nonlinearity(self.rbr_dense(x)) # RepVGG block end # ----------------------------- def autopad(k, p=None, d=1): # kernel, padding, dilation """Pad to 'same' shape outputs.""" if d > 1: k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size if p is None: p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad return p def makeDivisible(v: float, divisor: int, min_value: Optional[int] = None) -> int: """ This function is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.Py """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v def callMethod(self, ElementName): return getattr(self, ElementName) def setMethod(self, ElementName, ElementValue): return setattr(self, ElementName, ElementValue) def shuffleTensor(Feature: Tensor, Mode: int=1) -> Tensor: # shuffle multiple tensors with the same indexs # all tensors must have the same shape if isinstance(Feature, Tensor): Feature = [Feature] Indexs = None Output = [] for f in Feature: # not in-place operation, should update output B, C, H, W = f.shape if Mode == 1: # fully shuffle f = f.flatten(2) if Indexs is None: Indexs = torch.randperm(f.shape[-1], device=f.device) f = f[:, :, Indexs.to(f.device)] f = f.reshape(B, C, H, W) else: # shuflle along y and then x axis if Indexs is None: Indexs = [torch.randperm(H, device=f.device), torch.randperm(W, device=f.device)] f = f[:, :, Indexs[0].to(f.device)] f = f[:, :, :, Indexs[1].to(f.device)] Output.append(f) return Output class AdaptiveAvgPool2d(nn.AdaptiveAvgPool2d): def __init__(self, output_size: Union[int, tuple] = 1 ): super(AdaptiveAvgPool2d, self).__init__(output_size=output_size) def profileModule(self, Input: Tensor): Output = self.forward(Input) return Output, 0.0, 0.0 class AdaptiveMaxPool2d(nn.AdaptiveMaxPool2d): def __init__(self, output_size: Union[int, tuple] = 1): super(AdaptiveMaxPool2d, self).__init__(output_size=output_size) def profileModule(self, Input: Tensor): Output = self.forward(Input) return Output, 0.0, 0.0 NormLayerTuple = ( nn.BatchNorm1d, nn.BatchNorm2d, nn.SyncBatchNorm, nn.LayerNorm, nn.InstanceNorm1d, nn.InstanceNorm2d, nn.GroupNorm, nn.BatchNorm3d, ) def initWeight(Module): # init conv, norm , and linear layers ## empty module if Module is None: return ## conv layer elif isinstance(Module, (nn.Conv2d, nn.Conv3d, nn.ConvTranspose2d)): nn.init.kaiming_uniform_(Module.weight, a=math.sqrt(5)) if Module.bias is 
not None: fan_in, _ = nn.init._calculate_fan_in_and_fan_out(Module.weight) if fan_in != 0: bound = 1 / math.sqrt(fan_in) nn.init.uniform_(Module.bias, -bound, bound) ## norm layer elif isinstance(Module, NormLayerTuple): if Module.weight is not None: nn.init.ones_(Module.weight) if Module.bias is not None: nn.init.zeros_(Module.bias) ## linear layer elif isinstance(Module, nn.Linear): nn.init.kaiming_uniform_(Module.weight, a=math.sqrt(5)) if Module.bias is not None: fan_in, _ = nn.init._calculate_fan_in_and_fan_out(Module.weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 nn.init.uniform_(Module.bias, -bound, bound) elif isinstance(Module, (nn.Sequential, nn.ModuleList)): for m in Module: initWeight(m) elif list(Module.children()): for m in Module.children(): initWeight(m) class BaseConv2d(nn.Module): def __init__( self, in_channels: int, out_channels: int, kernel_size: int, stride: Optional[int] = 1, padding: Optional[int] = None, groups: Optional[int] = 1, bias: Optional[bool] = None, BNorm: bool = False, # norm_layer: Optional[Callable[..., nn.Module]]=nn.BatchNorm2d, ActLayer: Optional[Callable[..., nn.Module]] = None, dilation: int = 1, Momentum: Optional[float] = 0.1, **kwargs: Any ) -> None: super(BaseConv2d, self).__init__() if padding is None: padding = int((kernel_size - 1) // 2 * dilation) if bias is None: bias = not BNorm self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride self.padding = padding self.groups = groups self.bias = bias self.Conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, **kwargs) self.Bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=Momentum) if BNorm else nn.Identity() if ActLayer is not None: if isinstance(list(ActLayer().named_modules())[0][1], nn.Sigmoid): self.Act = ActLayer() else: self.Act = ActLayer(inplace=True) else: self.Act = ActLayer self.apply(initWeight) def forward(self, x: Tensor) -> Tensor: x = self.Conv(x) x = self.Bn(x) if self.Act is not None: x = self.Act(x) return x def profileModule(self, Input: Tensor): if Input.dim() != 4: print('Conv2d requires 4-dimensional Input (BxCxHxW). 
Provided Input has shape: {}'.format(Input.size())) BatchSize, in_channels, in_h, in_w = Input.size() assert in_channels == self.in_channels, '{}!={}'.format(in_channels, self.in_channels) k_h, k_w = pair(self.kernel_size) stride_h, stride_w = pair(self.stride) pad_h, pad_w = pair(self.padding) groups = self.groups out_h = (in_h - k_h + 2 * pad_h) // stride_h + 1 out_w = (in_w - k_w + 2 * pad_w) // stride_w + 1 # compute MACs MACs = (k_h * k_w) * (in_channels * self.out_channels) * (out_h * out_w) * 1.0 MACs /= groups if self.bias: MACs += self.out_channels * out_h * out_w # compute parameters Params = sum([p.numel() for p in self.parameters()]) Output = torch.zeros(size=(BatchSize, self.out_channels, out_h, out_w), dtype=Input.dtype, device=Input.device) # print(MACs) return Output, Params, MACs class MoCAttention(nn.Module): # Monte carlo attention def __init__( self, InChannels: int, HidChannels: int=None, SqueezeFactor: int=4, PoolRes: list=[1, 2, 3], Act: Callable[..., nn.Module]=nn.ReLU, ScaleAct: Callable[..., nn.Module]=nn.Sigmoid, MoCOrder: bool=True, **kwargs: Any, ) -> None: super().__init__() if HidChannels is None: HidChannels = max(makeDivisible(InChannels // SqueezeFactor, 8), 32) AllPoolRes = PoolRes + [1] if 1 not in PoolRes else PoolRes for k in AllPoolRes: Pooling = AdaptiveAvgPool2d(k) setMethod(self, 'Pool%d' % k, Pooling) self.SELayer = nn.Sequential( BaseConv2d(InChannels, HidChannels, 1, ActLayer=Act), BaseConv2d(HidChannels, InChannels, 1, ActLayer=ScaleAct), ) self.PoolRes = PoolRes self.MoCOrder = MoCOrder def monteCarloSample(self, x: Tensor) -> Tensor: if self.training: PoolKeep = np.random.choice(self.PoolRes) x1 = shuffleTensor(x)[0] if self.MoCOrder else x AttnMap: Tensor = callMethod(self, 'Pool%d' % PoolKeep)(x1) if AttnMap.shape[-1] > 1: AttnMap = AttnMap.flatten(2) AttnMap = AttnMap[:, :, torch.randperm(AttnMap.shape[-1])[0]] AttnMap = AttnMap[:, :, None, None] # squeeze twice else: AttnMap: Tensor = callMethod(self, 'Pool%d' % 1)(x) return AttnMap def forward(self, x: Tensor) -> Tensor: AttnMap = self.monteCarloSample(x) return x * self.SELayer(AttnMap) class Conv(nn.Module): """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" default_act = nn.SiLU() def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): return self.act(self.bn(self.conv(x))) class RepMCABottleneck(nn.Module): """Attentional Gated Convolution Bottleneck with RepVGG and MoCAttention.""" def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): """ Args: c1 (int): Input channels c2 (int): Output channels shortcut (bool): Whether to use shortcut connection g (int): Groups for convolutions k (tuple): Kernel sizes for convolutions (k1, k2) e (float): Expansion ratio for intermediate channels """ super().__init__() c_ = int(c2 * e) # Intermediate channels # Attention module self.att = MoCAttention(InChannels=c1) # Extract individual kernel sizes from tuple k1, k2 = k # First RepVGG convolution self.repvgg1 = RepVGG(in_channels=c1, out_channels=c1, kernel_size=k1, padding=k1//2) # Use k1 # Additional convolution branch self.conv_branch = Conv(c1, c2, 1) # 1x1 convolution # Second RepVGG convolution self.repvgg2 = RepVGG(in_channels=c1, out_channels=c2, 
kernel_size=k2, padding=k2//2) # Use k2 # Shortcut handling self.add = shortcut and c1 == c2 if shortcut and c1 != c2: # Adjust dimensions if needed self.shortcut_conv = Conv(c1, c2, 1) # 1x1 conv for channel adjustment else: self.shortcut_conv = nn.Identity() def forward(self, x): # Apply attention att_out = self.att(x) # First RepVGG convolution repvgg1_out = self.repvgg1(att_out) # Additional convolution branch conv_branch_out = self.conv_branch(att_out) # Second RepVGG convolution repvgg2_out = self.repvgg2(repvgg1_out) # Combine outputs combined = repvgg2_out + conv_branch_out # Shortcut connection if self.add: return combined + self.shortcut_conv(x) return combined class C2f(nn.Module): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): """Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing.""" super().__init__() self.c = int(c2 * e) # hidden channels self.cv1 = Conv(c1, 2 * self.c, 1, 1) self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) self.m = nn.ModuleList(RepMCABottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) def forward(self, x): """Forward pass through C2f layer.""" y = list(self.cv1(x).chunk(2, 1)) y.extend(m(y[-1]) for m in self.m) return self.cv2(torch.cat(y, 1)) def forward_split(self, x): """Forward pass using split() instead of chunk().""" y = list(self.cv1(x).split((self.c, self.c), 1)) y.extend(m(y[-1]) for m in self.m) return self.cv2(torch.cat(y, 1)) class C3(nn.Module): """CSP Bottleneck with 3 convolutions.""" def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values.""" super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = Conv(c1, c_, 1, 1) self.cv2 = Conv(c1, c_, 1, 1) self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) self.m = nn.Sequential(*(RepMCABottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) def forward(self, x): """Forward pass through the CSP bottleneck with 2 convolutions.""" return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) class C3k2_RepMCABottleneck(C2f): """Faster Implementation of CSP Bottleneck with 2 convolutions.""" def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True): """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks.""" super().__init__(c1, c2, n, shortcut, g, e) self.m = nn.ModuleList( C3k(self.c, self.c, 2, shortcut, g) if c3k else RepMCABottleneck(self.c, self.c, shortcut, g) for _ in range(n) ) class C3k(C3): """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.""" def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3): """Initializes the C3k module with specified channels, number of layers, and configurations.""" super().__init__(c1, c2, n, shortcut, g, e) c_ = int(c2 * e) # hidden channels # Create a tuple of kernel sizes (k, k) for RepMCABottleneck self.m = nn.Sequential(*(RepMCABottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n))) # Add to module exports __all__ = ['C3k2_RepMCABottleneck'] 报错:TypeError: unsupported operand type(s) for //: 'tuple' and 'int'

def gcd(a, b): while b: a, b = b, a % b return a # Other types of layers can go here (e.g., nn.Linear, etc.) def _init_weights(module, name, scheme=''): if isinstance(module, nn.Conv2d) or isinstance(module, nn.Conv3d): if scheme == 'normal': nn.init.normal_(module.weight, std=.02) if module.bias is not None: nn.init.zeros_(module.bias) elif scheme == 'trunc_normal': trunc_normal_tf_(module.weight, std=.02) if module.bias is not None: nn.init.zeros_(module.bias) elif scheme == 'xavier_normal': nn.init.xavier_normal_(module.weight) if module.bias is not None: nn.init.zeros_(module.bias) elif scheme == 'kaiming_normal': nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') if module.bias is not None: nn.init.zeros_(module.bias) else: # efficientnet like fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels fan_out //= module.groups nn.init.normal_(module.weight, 0, math.sqrt(2.0 / fan_out)) if module.bias is not None: nn.init.zeros_(module.bias) elif isinstance(module, nn.BatchNorm2d) or isinstance(module, nn.BatchNorm3d): nn.init.constant_(module.weight, 1) nn.init.constant_(module.bias, 0) elif isinstance(module, nn.LayerNorm): nn.init.constant_(module.weight, 1) nn.init.constant_(module.bias, 0) def act_layer(act, inplace=False, neg_slope=0.2, n_prelu=1): # activation layer act = act.lower() if act == 'relu': layer = nn.ReLU(inplace) elif act == 'relu6': layer = nn.ReLU6(inplace) elif act == 'leakyrelu': layer = nn.LeakyReLU(neg_slope, inplace) elif act == 'prelu': layer = nn.PReLU(num_parameters=n_prelu, init=neg_slope) elif act == 'gelu': layer = nn.GELU() elif act == 'hswish': layer = nn.Hardswish(inplace) else: raise NotImplementedError('activation layer [%s] is not found' % act) return layer def channel_shuffle(x, groups): batchsize, num_channels, height, width = x.data.size() channels_per_group = num_channels // groups # reshape x = x.view(batchsize, groups, channels_per_group, height, width) x = torch.transpose(x, 1, 2).contiguous() # flatten x = x.view(batchsize, -1, height, width) return x # Multi-scale depth-wise convolution (MSDC) class MSDC(nn.Module): def __init__(self, in_channels, kernel_sizes, stride, activation='relu6', dw_parallel=True): super(MSDC, self).__init__() self.in_channels = in_channels self.kernel_sizes = kernel_sizes self.activation = activation self.dw_parallel = dw_parallel self.dwconvs = nn.ModuleList([ nn.Sequential( nn.Conv2d(self.in_channels, self.in_channels, kernel_size, stride, kernel_size // 2, groups=self.in_channels, bias=False), nn.BatchNorm2d(self.in_channels), act_layer(self.activation, inplace=True) ) for kernel_size in self.kernel_sizes ]) self.init_weights('normal') def init_weights(self, scheme=''): named_apply(partial(_init_weights, scheme=scheme), self) def forward(self, x): # Apply the convolution layers in a loop outputs = [] for dwconv in self.dwconvs: dw_out = dwconv(x) outputs.append(dw_out) if self.dw_parallel == False: x = x + dw_out # You can return outputs based on what you intend to do with them return outputs class MSCB(nn.Module): """ Multi-scale convolution block (MSCB) """ def __init__(self, in_channels, out_channels, shortcut=False, stride=1, kernel_sizes=[1, 3, 5], expansion_factor=2, dw_parallel=True, activation='relu6'): super(MSCB, self).__init__() add = shortcut self.in_channels = in_channels self.out_channels = out_channels self.stride = stride self.kernel_sizes = kernel_sizes self.expansion_factor = expansion_factor self.dw_parallel = dw_parallel self.add = add 
self.activation = activation self.n_scales = len(self.kernel_sizes) # check stride value assert self.stride in [1, 2] # Skip connection if stride is 1 self.use_skip_connection = True if self.stride == 1 else False # expansion factor self.ex_channels = int(self.in_channels * self.expansion_factor) self.pconv1 = nn.Sequential( # pointwise convolution nn.Conv2d(self.in_channels, self.ex_channels, 1, 1, 0, bias=False), nn.BatchNorm2d(self.ex_channels), act_layer(self.activation, inplace=True) ) self.msdc = MSDC(self.ex_channels, self.kernel_sizes, self.stride, self.activation, dw_parallel=self.dw_parallel) if self.add == True: self.combined_channels = self.ex_channels * 1 else: self.combined_channels = self.ex_channels * self.n_scales self.pconv2 = nn.Sequential( # pointwise convolution nn.Conv2d(self.combined_channels, self.out_channels, 1, 1, 0, bias=False), nn.BatchNorm2d(self.out_channels), ) if self.use_skip_connection and (self.in_channels != self.out_channels): self.conv1x1 = nn.Conv2d(self.in_channels, self.out_channels, 1, 1, 0, bias=False) self.init_weights('normal') def init_weights(self, scheme=''): named_apply(partial(_init_weights, scheme=scheme), self) def forward(self, x): pout1 = self.pconv1(x) msdc_outs = self.msdc(pout1) if self.add == True: dout = 0 for dwout in msdc_outs: dout = dout + dwout else: dout = torch.cat(msdc_outs, dim=1) dout = channel_shuffle(dout, gcd(self.combined_channels, self.out_channels)) out = self.pconv2(dout) if self.use_skip_connection: if self.in_channels != self.out_channels: x = self.conv1x1(x) return x + out else: return out def autopad(k, p=None, d=1): # kernel, padding, dilation """Pad to 'same' shape outputs.""" if d > 1: k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size if p is None: p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad return p class Conv(nn.Module): """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" default_act = nn.SiLU() # default activation def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): """Initialize Conv layer with given arguments including activation.""" super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): """Apply convolution, batch normalization and activation to input tensor.""" return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): """Perform transposed convolution of 2D data.""" return self.act(self.conv(x)) import torch from torch import nn from ultralytics.nn.modules.conv import Conv from torch.nn import functional as F class Channel_Att(nn.Module): def __init__(self, channels, t=16): super(Channel_Att, self).__init__() self.channels = channels self.bn2 = nn.BatchNorm2d(self.channels, affine=True) def forward(self, x): residual = x x = self.bn2(x) weight_bn = self.bn2.weight.data.abs() / torch.sum(self.bn2.weight.data.abs()) x = x.permute(0, 2, 3, 1).contiguous() x = torch.mul(weight_bn, x) x = x.permute(0, 3, 1, 2).contiguous() x = torch.sigmoid(x) * residual # return x class NAMAttention(nn.Module): def __init__(self, channels, shape, out_channels=None, no_spatial=True): super(NAMAttention, self).__init__() self.Channel_Att = Channel_Att(channels) def forward(self, x): x_out1 = self.Channel_Att(x) return x_out1 
根据这些代码,参考Conv的结构,创建一个名为MSConv的模块,输入分为两个分支,第一个是MSDC模块到BatchNorm2d到SiLU,另一个是NAM注意力,注意力机制与其他三个模块并行,最后将SiLU的输出与NAM的输出合并为最终的输出。请编辑代码实现这个思路。注意NAM注意力机制的参数问题

from typing import Dict, Tuple import torch import torch.nn.functional as F import numpy as np from einops import rearrange from diffusers.schedulers.scheduling_ddpm import DDPMScheduler import inspect from controller.model.common.normalizer import LinearNormalizer from controller.policy.base_image_policy import BaseImagePolicy from controller.model.diffusion.transformer_for_action_diffusion import TransformerForActionDiffusion from controller.model.vision.obs_encoder import ObsEncoder from scipy.optimize import linear_sum_assignment import pickle # Adapted from https://github.com/lucidrains/pi-zero-pytorch/blob/e82fced40e55023a0ded22ab3bda495964353253/pi_zero_pytorch/pi_zero.py#L216 def noise_assignment(data, noise): device = data.device data, noise = tuple(rearrange(t, 'b ... -> b (...)') for t in (data, noise)) dist = torch.cdist(data, noise) _, assign = linear_sum_assignment(dist.cpu()) return torch.from_numpy(assign).to(device) class DexGraspVLAController(BaseImagePolicy): def __init__(self, shape_meta: dict, noise_scheduler: DDPMScheduler, obs_encoder: ObsEncoder, num_inference_steps=None, # arch n_layer=7, n_head=8, p_drop_attn=0.1, use_attn_mask=False, start_ckpt_path=None, # parameters passed to step **kwargs): super().__init__() # parse shapes action_shape = shape_meta['action']['shape'] assert len(action_shape) == 1 action_dim = action_shape[0] action_horizon = shape_meta['action']['horizon'] obs_shape, obs_part_length = obs_encoder.output_shape() n_emb = obs_shape[-1] obs_tokens = obs_shape[-2] model = TransformerForActionDiffusion( input_dim=action_dim, output_dim=action_dim, action_horizon=action_horizon, n_layer=n_layer, n_head=n_head, n_emb=n_emb, max_cond_tokens=obs_tokens+1, # obs tokens + 1 token for time p_drop_attn=p_drop_attn, obs_part_length=obs_part_length, use_attn_mask=use_attn_mask ) self.obs_encoder = obs_encoder self.model = model self.noise_scheduler = noise_scheduler self.normalizer = LinearNormalizer() self.action_dim = action_dim self.action_horizon = action_horizon self.start_ckpt_path = start_ckpt_path self.kwargs = kwargs if num_inference_steps is None: num_inference_steps = noise_scheduler.config.num_train_timesteps self.num_inference_steps = num_inference_steps # ========= inference ============ def conditional_sample(self, cond=None, gen_attn_map=True, **kwargs): model = self.model scheduler = self.noise_scheduler B = cond.shape[0] trajectory = torch.randn( size=(B, self.action_horizon, self.action_dim), dtype=self.dtype, device=self.device) # set step values scheduler.set_timesteps(self.num_inference_steps) # Store attention maps for all timesteps all_timestep_attention_maps = {} for t in scheduler.timesteps: # 1. predict model output model_output, attention_maps = model(trajectory, t, cond, training=False, gen_attn_map=gen_attn_map) all_timestep_attention_maps[t.cpu().item()] = attention_maps # 2. 
compute previous image: x_t -> x_t-1 trajectory = scheduler.step( model_output, t, trajectory, **kwargs ).prev_sample return trajectory, all_timestep_attention_maps def predict_action(self, obs_dict: Dict[str, torch.Tensor], output_path: str = None) -> Dict[str, torch.Tensor]: """ obs_dict: must include "obs" key action_pred: predicted action """ assert 'past_action' not in obs_dict # not implemented yet # normalize input # nobs = self.normalizer.normalize(obs_dict) nobs = obs_dict B = next(iter(nobs.values())).shape[0] # process input obs_tokens = self.obs_encoder(nobs, training=False) # (B, N, n_emb) # run sampling nsample, all_timestep_attention_maps = self.conditional_sample( cond=obs_tokens, gen_attn_map=True if output_path is not None else False, **self.kwargs) # unnormalize prediction assert nsample.shape == (B, self.action_horizon, self.action_dim) action_pred = self.normalizer['action'].unnormalize(nsample) if output_path is not None: # Convert tensors in obs_dict to numpy arrays obs_dict_numpy = {} for k, v in obs_dict.items(): if k in ['rgbm', 'right_cam_img']: obs_dict_numpy[k] = np.clip(v.detach().cpu().numpy() * 255, 0, 255).astype(np.uint8) else: obs_dict_numpy[k] = v.detach().cpu().numpy() obs_dict_numpy[k] = obs_dict_numpy[k][:2] save_dict = { 'attention_maps': all_timestep_attention_maps, 'obs_dict': obs_dict_numpy } with open(output_path, 'wb') as f: pickle.dump(save_dict, f) return action_pred # ========= training ============ def set_normalizer(self, normalizer: LinearNormalizer): self.normalizer.load_state_dict(normalizer.state_dict()) def get_optimizer( self, lr: float, weight_decay: float, betas: Tuple[float, float], ) -> torch.optim.Optimizer: # start with all of the candidate parameters (that require grad) param_dict = {pn: p for pn, p in self.named_parameters()} param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. 
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] optim_groups = [ {'params': decay_params, 'weight_decay': weight_decay}, {'params': nodecay_params, 'weight_decay': 0.0} ] num_decay_params = sum(p.numel() for p in decay_params) num_nodecay_params = sum(p.numel() for p in nodecay_params) print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters") print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters") fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters print(f"Fused AdamW available: {fused_available}") optimizer = torch.optim.AdamW( optim_groups, lr=lr, betas=betas, fused=fused_available ) return optimizer def compute_loss(self, batch, training=True): # normalize input assert 'valid_mask' not in batch # nobs = self.normalizer.normalize(batch['obs']) nobs = batch['obs'] nactions = self.normalizer['action'].normalize(batch['action']) trajectory = nactions # process input obs_tokens = self.obs_encoder(nobs, training) # (B, N, n_emb) # Sample noise that we'll add to the images noise = torch.randn(trajectory.shape, device=trajectory.device) assignment = noise_assignment(trajectory, noise) noise = noise[assignment] # Sample a random timestep for each image timesteps = torch.randint( 0, self.noise_scheduler.config.num_train_timesteps, (nactions.shape[0],), device=trajectory.device ).long() # Add noise to the clean images according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_trajectory = self.noise_scheduler.add_noise( trajectory, noise, timesteps) # Predict the noise residual pred, _ = self.model( noisy_trajectory, timesteps, cond=obs_tokens, training=training, gen_attn_map=False ) pred_type = self.noise_scheduler.config.prediction_type if pred_type == 'epsilon': target = noise elif pred_type == 'sample': target = trajectory else: raise ValueError(f"Unsupported prediction type {pred_type}") loss = F.mse_loss(pred, target) return loss def forward(self, batch, training=True): return self.compute_loss(batch, training)全文注释

from typing import Dict, Tuple, Union import torch import torch.nn.functional as F import numpy as np from einops import rearrange from diffusers.schedulers.scheduling_ddpm import DDPMScheduler import inspect from controller.model.common.normalizer import LinearNormalizer from controller.policy.base_image_policy import BaseImagePolicy from controller.model.diffusion.transformer_for_action_diffusion import ( TransformerForActionDiffusion, ) from controller.model.vision.obs_encoder import ObsEncoder from controller.model.vision.obs_encoder_without_mask import ObsEncoderWithoutMask from scipy.optimize import linear_sum_assignment import pickle # Adapted from https://github.com/lucidrains/pi-zero-pytorch/blob/e82fced40e55023a0ded22ab3bda495964353253/pi_zero_pytorch/pi_zero.py#L216 def noise_assignment(data, noise): device = data.device data, noise = tuple(rearrange(t, "b ... -> b (...)") for t in (data, noise)) dist = torch.cdist(data, noise) _, assign = linear_sum_assignment(dist.cpu()) return torch.from_numpy(assign).to(device) class DexGraspVLAController(BaseImagePolicy): def __init__( self, shape_meta: dict, noise_scheduler: DDPMScheduler, obs_encoder: Union[ObsEncoder, ObsEncoderWithoutMask], num_inference_steps=None, # arch n_layer=7, n_head=8, p_drop_attn=0.1, use_attn_mask=False, start_ckpt_path=None, # parameters passed to step **kwargs, ): super().__init__() # parse shapes action_shape = shape_meta["action"]["shape"] assert len(action_shape) == 1 action_dim = action_shape[0] action_horizon = shape_meta["action"]["horizon"] obs_shape, obs_part_length = obs_encoder.output_shape() n_emb = obs_shape[-1] obs_tokens = obs_shape[-2] model = TransformerForActionDiffusion( input_dim=action_dim, output_dim=action_dim, action_horizon=action_horizon, n_layer=n_layer, n_head=n_head, n_emb=n_emb, max_cond_tokens=obs_tokens + 1, # obs tokens + 1 token for time p_drop_attn=p_drop_attn, obs_part_length=obs_part_length, use_attn_mask=use_attn_mask, ) self.obs_encoder = obs_encoder self.model = model self.noise_scheduler = noise_scheduler self.normalizer = LinearNormalizer() self.action_dim = action_dim self.action_horizon = action_horizon self.start_ckpt_path = start_ckpt_path self.kwargs = kwargs if num_inference_steps is None: num_inference_steps = noise_scheduler.config.num_train_timesteps self.num_inference_steps = num_inference_steps # ========= inference ============ def conditional_sample(self, cond=None, gen_attn_map=True, **kwargs): model = self.model scheduler = self.noise_scheduler B = cond.shape[0] trajectory = torch.randn( size=(B, self.action_horizon, self.action_dim), dtype=self.dtype, device=self.device, ) # set step values scheduler.set_timesteps(self.num_inference_steps) # Store attention maps for all timesteps all_timestep_attention_maps = {} # 从 num_inference_steps - 1 到 0 for t in scheduler.timesteps: # 1. predict model output model_output, attention_maps = model( trajectory, t, cond, training=False, gen_attn_map=gen_attn_map ) all_timestep_attention_maps[t.cpu().item()] = attention_maps # 2. 
compute previous image: x_t -> x_t-1 trajectory = scheduler.step( model_output, t, trajectory, **kwargs ).prev_sample return trajectory, all_timestep_attention_maps def predict_action( self, obs_dict: Dict[str, torch.Tensor], output_path: str = None ) -> Dict[str, torch.Tensor]: """ obs_dict: must include "obs" key action_pred: predicted action """ assert "past_action" not in obs_dict # not implemented yet # normalize input # nobs = self.normalizer.normalize(obs_dict) nobs = obs_dict B = next(iter(nobs.values())).shape[0] # process input obs_tokens = self.obs_encoder(nobs, training=False) # (B, N, n_emb) # run sampling nsample, all_timestep_attention_maps = self.conditional_sample( cond=obs_tokens, gen_attn_map=True if output_path is not None else False, **self.kwargs, ) # unnormalize prediction assert nsample.shape == (B, self.action_horizon, self.action_dim) action_pred = self.normalizer["action"].unnormalize(nsample) if output_path is not None: # Convert tensors in obs_dict to numpy arrays obs_dict_numpy = {} for k, v in obs_dict.items(): if k in ["rgbm", "right_cam_img", "rgb"]: obs_dict_numpy[k] = np.clip( v.detach().cpu().numpy() * 255, 0, 255 ).astype(np.uint8) else: obs_dict_numpy[k] = v.detach().cpu().numpy() obs_dict_numpy[k] = obs_dict_numpy[k][:2] save_dict = { "attention_maps": all_timestep_attention_maps, "obs_dict": obs_dict_numpy, } with open(output_path, "wb") as f: pickle.dump(save_dict, f) return action_pred # ========= training ============ def set_normalizer(self, normalizer: LinearNormalizer): self.normalizer.load_state_dict(normalizer.state_dict()) def get_optimizer( self, lr: float, weight_decay: float, betas: Tuple[float, float], ) -> torch.optim.Optimizer: # start with all of the candidate parameters (that require grad) param_dict = {pn: p for pn, p in self.named_parameters()} param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. 
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] optim_groups = [ {"params": decay_params, "weight_decay": weight_decay}, {"params": nodecay_params, "weight_decay": 0.0}, ] num_decay_params = sum(p.numel() for p in decay_params) num_nodecay_params = sum(p.numel() for p in nodecay_params) print( f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters" ) print( f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters" ) fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters print(f"Fused AdamW available: {fused_available}") optimizer = torch.optim.AdamW( optim_groups, lr=lr, betas=betas, fused=fused_available ) return optimizer def compute_loss(self, batch, training=True): # normalize input assert "valid_mask" not in batch # nobs = self.normalizer.normalize(batch['obs']) nobs = batch["obs"] nactions = self.normalizer["action"].normalize(batch["action"]) trajectory = nactions # process input obs_tokens = self.obs_encoder(nobs, training) # (B, N, n_emb) # Sample noise that we'll add to the images noise = torch.randn(trajectory.shape, device=trajectory.device) assignment = noise_assignment(trajectory, noise) noise = noise[assignment] # Sample a random timestep for each image timesteps = torch.randint( 0, self.noise_scheduler.config.num_train_timesteps, (nactions.shape[0],), device=trajectory.device, ).long() # Add noise to the clean images according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_trajectory = self.noise_scheduler.add_noise(trajectory, noise, timesteps) # Predict the noise residual pred, _ = self.model( noisy_trajectory, timesteps, cond=obs_tokens, training=training, gen_attn_map=False, ) pred_type = self.noise_scheduler.config.prediction_type if pred_type == "epsilon": target = noise elif pred_type == "sample": target = trajectory else: raise ValueError(f"Unsupported prediction type {pred_type}") loss = F.mse_loss(pred, target) return loss def forward(self, batch, training=True): return self.compute_loss(batch, training) 全文注释

这是main.py文件的代码:from datetime import datetime from functools import partial from PIL import Image import cv2 import numpy as np from torch.utils.data import DataLoader from torch.version import cuda from torchvision import transforms from torchvision.datasets import CIFAR10 from torchvision.models import resnet from tqdm import tqdm import argparse import json import math import os import pandas as pd import torch import torch.nn as nn import torch.nn.functional as F #数据增强(核心增强部分) import torch from torchvision import transforms from torch.utils.data import Dataset, DataLoader # 设置参数 parser = argparse.ArgumentParser(description='Train MoCo on CIFAR-10') parser.add_argument('-a', '--arch', default='resnet18') # lr: 0.06 for batch 512 (or 0.03 for batch 256) parser.add_argument('--lr', '--learning-rate', default=0.06, type=float, metavar='LR', help='initial learning rate', dest='lr') parser.add_argument('--epochs', default=300, type=int, metavar='N', help='number of total epochs to run') parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int, help='learning rate schedule (when to drop lr by 10x); does not take effect if --cos is on') parser.add_argument('--cos', action='store_true', help='use cosine lr schedule') parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size') parser.add_argument('--wd', default=5e-4, type=float, metavar='W', help='weight decay') # moco specific configs: parser.add_argument('--moco-dim', default=128, type=int, help='feature dimension') parser.add_argument('--moco-k', default=4096, type=int, help='queue size; number of negative keys') parser.add_argument('--moco-m', default=0.99, type=float, help='moco momentum of updating key encoder') parser.add_argument('--moco-t', default=0.1, type=float, help='softmax temperature') parser.add_argument('--bn-splits', default=8, type=int, help='simulate multi-gpu behavior of BatchNorm in one gpu; 1 is SyncBatchNorm in multi-gpu') parser.add_argument('--symmetric', action='store_true', help='use a symmetric loss function that backprops to both crops') # knn monitor parser.add_argument('--knn-k', default=20, type=int, help='k in kNN monitor') parser.add_argument('--knn-t', default=0.1, type=float, help='softmax temperature in kNN monitor; could be different with moco-t') # utils parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--results-dir', default='', type=str, metavar='PATH', help='path to cache (default: none)') ''' args = parser.parse_args() # running in command line ''' args = parser.parse_args('') # running in ipynb # set command line arguments here when running in ipynb args.epochs = 300 # 修改处 args.cos = True args.schedule = [] # cos in use args.symmetric = False if args.results_dir == '': args.results_dir = "E:\\contrast\\yolov8\\MoCo\\run\\cache-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S-moco") moco_args = args class CIFAR10Pair(CIFAR10): def __getitem__(self, index): img = self.data[index] img = Image.fromarray(img) # 原始图像增强 im_1 = self.transform(img) im_2 = self.transform(img) # 退化增强生成额外视图 degraded_results = image_degradation_and_augmentation(img) im_3 = self.transform(Image.fromarray(degraded_results['augmented_images'][0])) # 选择第一组退化增强 im_4 = self.transform(Image.fromarray(degraded_results['cutmix_image'])) return im_1, im_2, im_3, im_4 # 返回原始增强+退化增强 # 定义数据加载器 # class CIFAR10Pair(CIFAR10): # """CIFAR10 Dataset. 
# """ # def __getitem__(self, index): # img = self.data[index] # img = Image.fromarray(img) # if self.transform is not None: # im_1 = self.transform(img) # im_2 = self.transform(img) # return im_1, im_2 import cv2 import numpy as np import random def apply_interpolation_degradation(img, method): """ 应用插值退化 参数: img: 输入图像(numpy数组) method: 插值方法('nearest', 'bilinear', 'bicubic') 返回: 退化后的图像 """ # 获取图像尺寸 h, w = img.shape[:2] # 应用插值方法 if method == 'nearest': # 最近邻退化: 下采样+上采样 downsampled = cv2.resize(img, (w//2, h//2), interpolation=cv2.INTER_NEAREST) degraded = cv2.resize(downsampled, (w, h), interpolation=cv2.INTER_NEAREST) elif method == 'bilinear': # 双线性退化: 下采样+上采样 downsampled = cv2.resize(img, (w//2, h//2), interpolation=cv2.INTER_LINEAR) degraded = cv2.resize(downsampled, (w, h), interpolation=cv2.INTER_LINEAR) elif method == 'bicubic': # 双三次退化: 下采样+上采样 downsampled = cv2.resize(img, (w//2, h//2), interpolation=cv2.INTER_CUBIC) degraded = cv2.resize(downsampled, (w, h), interpolation=cv2.INTER_CUBIC) else: degraded = img return degraded def darken_image(img, intensity=0.3): """ 应用黑暗处理 - 降低图像亮度并增加暗区对比度 参数: img: 输入图像(numpy数组) intensity: 黑暗强度 (0.1-0.9) 返回: 黑暗处理后的图像 """ # 限制强度范围 intensity = max(0.1, min(0.9, intensity)) # 将图像转换为HSV颜色空间 hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32) # 降低亮度(V通道) hsv[:, :, 2] = hsv[:, :, 2] * intensity # 增加暗区的对比度 - 使用gamma校正 gamma = 1.0 + (1.0 - intensity) # 黑暗强度越大,gamma值越大 hsv[:, :, 2] = np.power(hsv[:, :, 2]/255.0, gamma) * 255.0 # 限制值在0-255范围内 hsv[:, :, 2] = np.clip(hsv[:, :, 2], 0, 255) # 转换回RGB return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB) def random_affine(image): """ 随机仿射变换(缩放和平移) 参数: image: 输入图像(numpy数组) 返回: 变换后的图像 """ height, width = image.shape[:2] # 随机缩放因子 (0.8 to 1.2) scale = random.uniform(0.8, 1.2) # 随机平移 (10% of image size) max_trans = 0.1 * min(width, height) tx = random.randint(-int(max_trans), int(max_trans)) ty = random.randint(-int(max_trans), int(max_trans)) # 变换矩阵 M = np.array([[scale, 0, tx], [0, scale, ty]], dtype=np.float32) # 应用仿射变换 transformed = cv2.warpAffine(image, M, (width, height)) return transformed def augment_hsv(image, h_gain=0.1, s_gain=0.5, v_gain=0.5): """ HSV色彩空间增强 参数: image: 输入图像(numpy数组) h_gain, s_gain, v_gain: 各通道的增益范围 返回: 增强后的图像 """ # 限制增益范围 h_gain = max(-0.1, min(0.1, random.uniform(-h_gain, h_gain))) s_gain = max(0.5, min(1.5, random.uniform(1-s_gain, 1+s_gain))) v_gain = max(0.5, min(1.5, random.uniform(1-v_gain, 1+v_gain))) # 转换为HSV hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype(np.float32) # 应用增益 hsv[:, :, 0] = (hsv[:, :, 0] * (1 + h_gain)) % 180 hsv[:, :, 1] = np.clip(hsv[:, :, 1] * s_gain, 0, 255) hsv[:, :, 2] = np.clip(hsv[:, :, 2] * v_gain, 0, 255) # 转换回RGB return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB) # def mixup(img1, img2, alpha=0.6): # """ # 将两幅图像混合在一起 # 参数: # img1, img2: 输入图像(numpy数组) # alpha: Beta分布的参数,控制混合比例 # 返回: # 混合后的图像 # """ # # 生成混合比例 # lam = random.betavariate(alpha, alpha) # # 确保图像尺寸相同 # if img1.shape != img2.shape: # img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0])) # # 混合图像 # mixed = (lam * img1.astype(np.float32) + (1 - lam) * img2.astype(np.float32)).astype(np.uint8) # return mixed # def image_degradation_and_augmentation(image,dark_intensity=0.3): # """ # 完整的图像退化和增强流程 # 参数: # image: 输入图像(PIL.Image或numpy数组) # 返回: # dict: 包含所有退化组和最终增强结果的字典 # """ # # 确保输入是numpy数组 # if not isinstance(image, np.ndarray): # image = np.array(image) # # 确保图像为RGB格式 # if len(image.shape) == 2: # image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) # elif image.shape[2] == 
4: # image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) # # 原始图像 # original = image.copy() # # 插值方法列表 # interpolation_methods = ['nearest', 'bilinear', 'bicubic'] # # 第一组退化: 三种插值方法 # group1 = [] # for method in interpolation_methods: # degraded = apply_interpolation_degradation(original, method) # group1.append(degraded) # # 第二组退化: 随机额外退化 # group2 = [] # for img in group1: # # 随机选择一种退化方法 # method = random.choice(interpolation_methods) # extra_degraded = apply_interpolation_degradation(img, method) # group2.append(extra_degraded) # # 所有退化图像组合 # all_degraded_images = [original] + group1 + group2 # # 应用黑暗处理 (在增强之前) # darkened_images = [darken_image(img, intensity=dark_intensity) for img in all_degraded_images] # # 应用数据增强 # # 1. 随机仿射变换 # affine_images = [random_affine(img) for img in darkened_images] # # 2. HSV增强 # hsv_images = [augment_hsv(img) for img in affine_images] # # 3. MixUp增强 # # 随机选择两个增强后的图像进行混合 # mixed_image = mixup( # random.choice(hsv_images), # random.choice(hsv_images) # ) # # 返回结果 # results = { # 'original': original, # 'degraded_group1': group1, # 第一组退化图像 # 'degraded_group2': group2, # 第二组退化图像 # 'augmented_images': hsv_images, # 所有增强后的图像(原始+六组退化) # 'mixup_image': mixed_image # MixUp混合图像 # } # return results # # def add_gaussian_noise(image, mean=0, sigma=25): # # """添加高斯噪声""" # # noise = np.random.normal(mean, sigma, image.shape) # # noisy = np.clip(image + noise, 0, 255).astype(np.uint8) # # return noisy # # def random_cutout(image, max_holes=3, max_height=16, max_width=16): # # """随机CutOut增强""" # # h, w = image.shape[:2] # # for _ in range(random.randint(1, max_holes)): # # hole_h = random.randint(1, max_height) # # hole_w = random.randint(1, max_width) # # y = random.randint(0, h - hole_h) # # x = random.randint(0, w - hole_w) # # image[y:y+hole_h, x:x+hole_w] = 0 # # return image import cv2 import numpy as np import random from matplotlib import pyplot as plt import pywt def wavelet_degradation(image, level=0.5): """小波系数衰减退化""" # 小波分解 coeffs = pywt.dwt2(image, 'haar') cA, (cH, cV, cD) = coeffs # 衰减高频系数 cH = cH * level cV = cV * level cD = cD * level # 重建图像 return pywt.idwt2((cA, (cH, cV, cD)), 'haar')[:image.shape[0], :image.shape[1]] def adaptive_interpolation_degradation(image): """自适应插值退化(随机选择最近邻或双三次插值)""" if random.choice([True, False]): method = cv2.INTER_NEAREST # 最近邻插值 else: method = cv2.INTER_CUBIC # 双三次插值 # 先缩小再放大 scale_factor = random.uniform(0.3, 0.8) small = cv2.resize(image, None, fx=scale_factor, fy=scale_factor, interpolation=method) return cv2.resize(small, (image.shape[1], image.shape[0]), interpolation=method) def bilinear_degradation(image): """双线性插值退化""" # 先缩小再放大 scale_factor = random.uniform(0.3, 0.8) small = cv2.resize(image, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR) return cv2.resize(small, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_LINEAR) def cutmix(img1, img2, bboxes1=None, bboxes2=None, beta=1.0): """ 参数: img1: 第一张输入图像(numpy数组) img2: 第二张输入图像(numpy数组) bboxes1: 第一张图像的边界框(可选) bboxes2: 第二张图像的边界框(可选) beta: Beta分布的参数,控制裁剪区域的大小 返回: 混合后的图像和边界框(如果有) """ # 确保图像尺寸相同 if img1.shape != img2.shape: img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0])) h, w = img1.shape[:2] # 生成裁剪区域的lambda值(混合比例) lam = np.random.beta(beta, beta) # 计算裁剪区域的宽高 cut_ratio = np.sqrt(1. 
- lam) cut_w = int(w * cut_ratio) cut_h = int(h * cut_ratio) # 随机确定裁剪区域的中心点 cx = np.random.randint(w) cy = np.random.randint(h) # 计算裁剪区域的边界 x1 = np.clip(cx - cut_w // 2, 0, w) y1 = np.clip(cy - cut_h // 2, 0, h) x2 = np.clip(cx + cut_w // 2, 0, w) y2 = np.clip(cy + cut_h // 2, 0, h) # 执行CutMix操作 mixed_img = img1.copy() mixed_img[y1:y2, x1:x2] = img2[y1:y2, x1:x2] # 计算实际的混合比例 lam = 1 - ((x2 - x1) * (y2 - y1) / (w * h)) # 处理边界框(如果有) mixed_bboxes = None if bboxes1 is not None and bboxes2 is not None: mixed_bboxes = [] # 添加第一张图像的边界框 for bbox in bboxes1: mixed_bboxes.append(bbox + [lam]) # 添加混合权重 # 添加第二张图像的边界框(只添加在裁剪区域内的) for bbox in bboxes2: # 检查边界框是否在裁剪区域内 bbox_x_center = (bbox[0] + bbox[2]) / 2 bbox_y_center = (bbox[1] + bbox[3]) / 2 if (x1 <= bbox_x_center <= x2) and (y1 <= bbox_y_center <= y2): mixed_bboxes.append(bbox + [1 - lam]) return mixed_img, mixed_bboxes def image_degradation_and_augmentation(image, bboxes=None): """ 完整的图像退化和增强流程(修改为使用CutMix) 参数: image: 输入图像(PIL.Image或numpy数组) bboxes: 边界框(可选) 返回: dict: 包含所有退化组和最终增强结果的字典 """ # 确保输入是numpy数组 if not isinstance(image, np.ndarray): image = np.array(image) # 确保图像为RGB格式 if len(image.shape) == 2: image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) elif image.shape[2] == 4: image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) degraded_sets = [] original = image.copy() # 第一组退化:三种基础退化 degraded_sets.append(wavelet_degradation(original.copy())) degraded_sets.append(degraded_sets) degraded_sets.append(adaptive_interpolation_degradation(original.copy())) degraded_sets.append(degraded_sets) degraded_sets.append(bilinear_degradation(original.copy())) degraded_sets.append(degraded_sets) # # 原始图像 # original = image.copy() # # 插值方法列表 # interpolation_methods = ['nearest', 'bilinear', 'bicubic'] # # 第一组退化: 三种插值方法 # group1 = [] # for method in interpolation_methods: # degraded = apply_interpolation_degradation(original, method) # group1.append(degraded) # 第二组退化: 随机额外退化 # group2 = [] # for img in group1: # # 随机选择一种退化方法 # method = random.choice(interpolation_methods) # extra_degraded = apply_interpolation_degradation(img, method) # group2.append(extra_degraded) # 第二组退化:随机选择再退化 methods = [wavelet_degradation, adaptive_interpolation_degradation, bilinear_degradation] group2=[] for img in degraded_sets: selected_method = random.choice(methods) group2.append(selected_method(img)) group2.append(group2) # 原始图像 original = image.copy() all_degraded_images = [original] + degraded_sets + group2 # 应用黑暗处理 dark_original = darken_image(original) dark_degraded = [darken_image(img) for img in all_degraded_images] # 合并原始和退化图像 all_images = [dark_original] + dark_degraded # 应用数据增强 # 1. 随机仿射变换 affine_images = [random_affine(img) for img in all_images] # 2. HSV增强 hsv_images = [augment_hsv(img) for img in affine_images] # 3. 
CutMix增强 # 随机选择两个增强后的图像进行混合 mixed_image, mixed_bboxes = cutmix( random.choice(hsv_images), random.choice(hsv_images), bboxes1=bboxes if bboxes is not None else None, bboxes2=bboxes if bboxes is not None else None ) # 返回结果 results = { 'original': original, 'degraded': dark_degraded, 'augmented_images': hsv_images, # 所有增强后的图像(原始+六组退化) 'cutmix_image': mixed_image, # CutMix混合图像 'cutmix_bboxes': mixed_bboxes if bboxes is not None else None # 混合后的边界框 } return results train_transform = transforms.Compose([ transforms.RandomResizedCrop(32), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8), transforms.RandomGrayscale(p=0.2), transforms.ToTensor(), transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])]) test_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])]) # data_processing prepare train_data = CIFAR10Pair(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=True, transform=train_transform, download=False) moco_train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True) memory_data = CIFAR10(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=True, transform=test_transform, download=False) memory_loader = DataLoader(memory_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) test_data = CIFAR10(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=False, transform=test_transform, download=False) test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) # 定义基本编码器 # SplitBatchNorm: simulate multi-gpu behavior of BatchNorm in one gpu by splitting alone the batch dimension # implementation adapted from https://github.com/davidcpage/cifar10-fast/blob/master/torch_backend.py class SplitBatchNorm(nn.BatchNorm2d): def __init__(self, num_features, num_splits, **kw): super().__init__(num_features, **kw) self.num_splits = num_splits def forward(self, input): N, C, H, W = input.shape if self.training or not self.track_running_stats: running_mean_split = self.running_mean.repeat(self.num_splits) running_var_split = self.running_var.repeat(self.num_splits) outcome = nn.functional.batch_norm( input.view(-1, C * self.num_splits, H, W), running_mean_split, running_var_split, self.weight.repeat(self.num_splits), self.bias.repeat(self.num_splits), True, self.momentum, self.eps).view(N, C, H, W) self.running_mean.data.copy_(running_mean_split.view(self.num_splits, C).mean(dim=0)) self.running_var.data.copy_(running_var_split.view(self.num_splits, C).mean(dim=0)) return outcome else: return nn.functional.batch_norm( input, self.running_mean, self.running_var, self.weight, self.bias, False, self.momentum, self.eps) class ModelBase(nn.Module): """ Common CIFAR ResNet recipe. 
Comparing with ImageNet ResNet recipe, it: (i) replaces conv1 with kernel=3, str=1 (ii) removes pool1 """ def __init__(self, feature_dim=128, arch=None, bn_splits=16): super(ModelBase, self).__init__() # use split batchnorm norm_layer = partial(SplitBatchNorm, num_splits=bn_splits) if bn_splits > 1 else nn.BatchNorm2d resnet_arch = getattr(resnet, arch) net = resnet_arch(num_classes=feature_dim, norm_layer=norm_layer) self.net = [] for name, module in net.named_children(): if name == 'conv1': module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) if isinstance(module, nn.MaxPool2d): continue if isinstance(module, nn.Linear): self.net.append(nn.Flatten(1)) self.net.append(module) self.net = nn.Sequential(*self.net) def forward(self, x): x = self.net(x) # note: not normalized here return x # 定义MOCO class ModelMoCo(nn.Module): def __init__(self, dim=128, K=4096, m=0.99, T=0.1, arch='resnet18', bn_splits=8, symmetric=True): super(ModelMoCo, self).__init__() self.K = K self.m = m self.T = T self.symmetric = symmetric # create the encoders self.encoder_q = ModelBase(feature_dim=dim, arch=arch, bn_splits=bn_splits) self.encoder_k = ModelBase(feature_dim=dim, arch=arch, bn_splits=bn_splits) for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): param_k.data.copy_(param_q.data) # initialize param_k.requires_grad = False # not update by gradient 不参与训练 # create the queue self.register_buffer("queue", torch.randn(dim, K)) self.queue = nn.functional.normalize(self.queue, dim=0) self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) @torch.no_grad() def _momentum_update_key_encoder(self): # 动量更新encoder_k """ Momentum update of the key encoder """ for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): param_k.data = param_k.data * self.m + param_q.data * (1. - self.m) @torch.no_grad() def _dequeue_and_enqueue(self, keys): # 出队与入队 batch_size = keys.shape[0] ptr = int(self.queue_ptr) assert self.K % batch_size == 0 # for simplicity # replace the keys at ptr (dequeue and enqueue) self.queue[:, ptr:ptr + batch_size] = keys.t() # transpose ptr = (ptr + batch_size) % self.K # move pointer self.queue_ptr[0] = ptr @torch.no_grad() def _batch_shuffle_single_gpu(self, x): """ Batch shuffle, for making use of BatchNorm. """ # random shuffle index idx_shuffle = torch.randperm(x.shape[0]).cuda() # index for restoring idx_unshuffle = torch.argsort(idx_shuffle) return x[idx_shuffle], idx_unshuffle @torch.no_grad() def _batch_unshuffle_single_gpu(self, x, idx_unshuffle): """ Undo batch shuffle. 
""" return x[idx_unshuffle] def contrastive_loss(self, im_q, im_k): # compute query features q = self.encoder_q(im_q) # queries: NxC q = nn.functional.normalize(q, dim=1) # already normalized # compute key features with torch.no_grad(): # no gradient to keys # shuffle for making use of BN im_k_, idx_unshuffle = self._batch_shuffle_single_gpu(im_k) k = self.encoder_k(im_k_) # keys: NxC k = nn.functional.normalize(k, dim=1) # already normalized # undo shuffle k = self._batch_unshuffle_single_gpu(k, idx_unshuffle) # compute logits # Einstein sum is more intuitive # positive logits: Nx1 l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1) # negative logits: NxK l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()]) # logits: Nx(1+K) logits = torch.cat([l_pos, l_neg], dim=1) # apply temperature logits /= self.T # labels: positive key indicators labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda() loss = nn.CrossEntropyLoss().cuda()(logits, labels) # 交叉熵损失 return loss, q, k def forward(self, im1, im2): """ Input: im_q: a batch of query images im_k: a batch of key images Output: loss """ # update the key encoder with torch.no_grad(): # no gradient to keys self._momentum_update_key_encoder() # compute loss if self.symmetric: # asymmetric loss loss_12, q1, k2 = self.contrastive_loss(im1, im2) loss_21, q2, k1 = self.contrastive_loss(im2, im1) loss = loss_12 + loss_21 k = torch.cat([k1, k2], dim=0) else: # asymmetric loss loss, q, k = self.contrastive_loss(im1, im2) self._dequeue_and_enqueue(k) return loss # create model moco_model = ModelMoCo( dim=args.moco_dim, K=args.moco_k, m=args.moco_m, T=args.moco_t, arch=args.arch, bn_splits=args.bn_splits, symmetric=args.symmetric, ).cuda() # print(moco_model.encoder_q) moco_model_1 = ModelMoCo( dim=args.moco_dim, K=args.moco_k, m=args.moco_m, T=args.moco_t, arch=args.arch, bn_splits=args.bn_splits, symmetric=args.symmetric, ).cuda() # print(moco_model_1.encoder_q) """ CIFAR10 Dataset. 
""" from torch.cuda import amp scaler = amp.GradScaler(enabled=cuda) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # train for one epoch # def moco_train(net, net_1, data_loader, train_optimizer, epoch, args): # net.train() # adjust_learning_rate(moco_optimizer, epoch, args) # total_loss, total_num, train_bar = 0.0, 0, tqdm(data_loader) # loss_add = 0.0 # for im_1, im_2 in train_bar: # im_1, im_2 = im_1.cuda(non_blocking=True), im_2.cuda(non_blocking=True) # loss = net(im_1, im_2) # 原始图像对比损失 梯度清零—>梯度回传—>梯度跟新 # # lossT = loss # 只使用原始对比损失 # # train_optimizer.zero_grad() # # lossT.backward() # # train_optimizer.step() # # loss_add += lossT.item() # # total_num += data_loader.batch_size # # total_loss += loss.item() * data_loader.batch_size # # train_bar.set_description( # # 'Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format( # # epoch, args.epochs, # # train_optimizer.param_groups[0]['lr'], # # loss_add / total_num # # ) # # ) # #傅里叶变换处理流程 # #im_3 = torch.rfft(im_1, 3, onesided=False, normalized=True)[:, :, :, :, 0] # fft_output = torch.fft.fftn(im_1, dim=(-3, -2, -1), norm="ortho")#转换为频域 # real_imag = torch.view_as_real(fft_output)#分解实部虚部 # im_3 = real_imag[..., 0]#提取频域实部作为新视图 # #该处理实现了频域空间的增强,与空间域增强形成了互补 # #im_4 = torch.rfft(im_2, 3, onesided=False, normalized=True)[:, :, :, :, 0] # fft_output = torch.fft.fftn(im_2, dim=(-3, -2, -1), norm="ortho") # real_imag = torch.view_as_real(fft_output) # im_4 = real_imag[..., 0] # loss_1 = net_1(im_3, im_4)#频域特征对比损失 # lossT = 0.8*loss + 0.2*loss_1#多模态损失对比融合 # train_optimizer.zero_grad() # lossT.backward() # train_optimizer.step() # loss_add += lossT # total_num += data_loader.batch_size # total_loss += loss.item() * data_loader.batch_size # # train_bar.set_description( # # 'Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format(epoch, args.epochs, moco_optimizer.param_groups[0]['lr'], # # loss_add / total_num)) # return (loss_add / total_num).cpu().item() # yolov5需要的损失 def moco_train(net, net_1, data_loader, train_optimizer, epoch, args): net.train() adjust_learning_rate(train_optimizer, epoch, args) total_loss, total_num = 0.0, 0 train_bar = tqdm(data_loader) for im_1, im_2, im_3, im_4 in train_bar: # 接收4组视图 im_1, im_2 = im_1.cuda(), im_2.cuda() im_3, im_4 = im_3.cuda(), im_4.cuda() # 原始空间域对比损失 loss_orig = net(im_1, im_2) # 退化增强图像的空间域对比损失 loss_degraded = net(im_3, im_4) # 频域处理(对退化增强后的图像) fft_3 = torch.fft.fftn(im_3, dim=(-3, -2, -1), norm="ortho") fft_3 = torch.view_as_real(fft_3)[..., 0] # 取实部 fft_4 = torch.fft.fftn(im_4, dim=(-3, -2, -1), norm="ortho") fft_4 = torch.view_as_real(fft_4)[..., 0] # 频域对比损失 loss_freq = net_1(fft_3, fft_4) # 多模态损失融合 loss = 0.6 * loss_orig + 0.3 * loss_degraded + 0.1 * loss_freq # 反向传播 train_optimizer.zero_grad() loss.backward() train_optimizer.step() # 记录损失 total_num += data_loader.batch_size total_loss += loss.item() # train_bar.set_description(f'Epoch: [{epoch}/{args.epochs}] Loss: {total_loss/total_num:.4f}') return total_loss / total_num # lr scheduler for training def adjust_learning_rate(optimizer, epoch, args): # 学习率衰减 """Decay the learning rate based on schedule""" lr = args.lr if args.cos: # cosine lr schedule lr *= 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) else: # stepwise lr schedule for milestone in args.schedule: lr *= 0.1 if epoch >= milestone else 1. 
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# test using a knn monitor
def test(net, memory_data_loader, test_data_loader, epoch, args):
    net.eval()
    classes = len(memory_data_loader.dataset.classes)
    total_top1, total_top5, total_num, feature_bank = 0.0, 0.0, 0, []
    with torch.no_grad():
        # generate feature bank
        for data, target in tqdm(memory_data_loader, desc='Feature extracting'):
            feature = net(data.cuda(non_blocking=True))
            feature = F.normalize(feature, dim=1)
            feature_bank.append(feature)
        # [D, N]
        feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
        # [N]
        feature_labels = torch.tensor(memory_data_loader.dataset.targets, device=feature_bank.device)
        # loop over the test data to predict labels by weighted knn search
        test_bar = tqdm(test_data_loader)
        for data, target in test_bar:
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            feature = net(data)
            feature = F.normalize(feature, dim=1)
            pred_labels = knn_predict(feature, feature_bank, feature_labels, classes, args.knn_k, args.knn_t)
            total_num += data.size(0)
            total_top1 += (pred_labels[:, 0] == target).float().sum().item()
            test_bar.set_description(
                'Test Epoch: [{}/{}] Acc@1:{:.2f}%'.format(epoch, args.epochs, total_top1 / total_num * 100))
    return total_top1 / total_num * 100

# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
# implementation follows http://github.com/zhirongw/lemniscate.pytorch and https://github.com/leftthomas/SimCLR
def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t):
    # compute cos similarity between each feature vector and the feature bank ---> [B, N]
    sim_matrix = torch.mm(feature, feature_bank)
    # [B, K]
    sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1)
    # [B, K]
    sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1), dim=-1, index=sim_indices)
    sim_weight = (sim_weight / knn_t).exp()

    # counts for each class; [B*K, C]
    one_hot_label = torch.zeros(feature.size(0) * knn_k, classes, device=sim_labels.device)
    one_hot_label = one_hot_label.scatter(dim=-1, index=sim_labels.view(-1, 1), value=1.0)
    # weighted score ---> [B, C]
    pred_scores = torch.sum(one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1), dim=1)

    pred_labels = pred_scores.argsort(dim=-1, descending=True)
    return pred_labels

# start training: define the optimizer
moco_optimizer = torch.optim.SGD(moco_model.parameters(), lr=args.lr, weight_decay=args.wd, momentum=0.9)

How should the code above be modified to fix these issues?
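A likely culprit, reading the code above: the new `moco_train` unpacks four views per batch (`im_1 ... im_4`), while the `CIFAR10Pair` loader defined earlier yields only two, so iteration fails before any loss is computed. Below is a minimal sketch of a four-view dataset, under the assumption that the dark/HSV degradation pipeline from the top of this post is wrapped in a transform; `CIFAR10FourView` and `degrade_transform` are illustrative names, not part of the original code:

```python
from torchvision.datasets import CIFAR10
from PIL import Image

class CIFAR10FourView(CIFAR10):
    """Sketch (assumption): yield two spatial views plus two degraded views per sample,
    so that `for im_1, im_2, im_3, im_4 in train_bar` unpacks cleanly."""
    def __init__(self, root, train, transform, degrade_transform, download=False):
        super().__init__(root=root, train=train, transform=None, download=download)
        self.base_transform = transform          # e.g. train_transform above
        self.degrade_transform = degrade_transform  # hypothetical hook for the degradation pipeline

    def __getitem__(self, index):
        img = Image.fromarray(self.data[index])
        im_1 = self.base_transform(img)      # spatial view 1
        im_2 = self.base_transform(img)      # spatial view 2
        im_3 = self.degrade_transform(img)   # degraded view 1
        im_4 = self.degrade_transform(img)   # degraded view 2
        return im_1, im_2, im_3, im_4
```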

import cv2
import numpy as np
import torch
from piq import multi_scale_ssim, vif_p
from scipy import ndimage
from scipy.stats import pearsonr
import pywt


def calculate_vif(ir_image, visible_image, fused_image):
    """
    Visual information fidelity (VIF):
    $$VIF = \frac{\sum_{j} I(\vec{C}^N_j; \vec{F}_j | s_j)}{\sum_{j} I(\vec{C}^N_j; \vec{E}_j | s_j)}$$
    """
    # convert to PyTorch tensors of shape [1, 1, H, W]
    ir_tensor = torch.tensor(ir_image).unsqueeze(0).unsqueeze(0).float()
    fused_tensor = torch.tensor(fused_image).unsqueeze(0).unsqueeze(0).float()
    # compute VIF with piq
    vif_score = vif_p(ir_tensor, fused_tensor, data_range=1.0).item()
    return vif_score


def calculate_scd(ir_image, visible_image, fused_image):
    """
    Spatial content difference (SCD):
    $$SCD = corr(\nabla F, \nabla I) - corr(\nabla F, \nabla V)$$
    """
    # gradient magnitude
    def compute_gradient(img):
        dx = ndimage.sobel(img, axis=0, mode='constant')
        dy = ndimage.sobel(img, axis=1, mode='constant')
        return np.sqrt(dx ** 2 + dy ** 2)

    grad_ir = compute_gradient(ir_image)
    grad_vis = compute_gradient(visible_image)
    grad_fused = compute_gradient(fused_image)

    # correlation coefficients
    corr_ir = pearsonr(grad_fused.flatten(), grad_ir.flatten())[0]
    corr_vis = pearsonr(grad_fused.flatten(), grad_vis.flatten())[0]

    scd = corr_ir - corr_vis
    return abs(scd)  # absolute value, i.e. the magnitude of the difference


def calculate_qabf(ir_image, visible_image, fused_image):
    """
    Gradient-based fusion metric (QAB/F):
    $$Q^{AB/F} = \frac{\sum_{i=1}^{W} \sum_{j=1}^{H} Q(i,j)}{\sum_{i=1}^{W} \sum_{j=1}^{H} \max(Q^A(i,j), Q^B(i,j))}$$
    """
    # gradient magnitude and orientation
    def compute_gradients(img):
        gx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=3)
        gy = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=3)
        magnitude = np.sqrt(gx ** 2 + gy ** 2)
        angle = np.arctan2(gy, gx)
        return magnitude, angle

    mag_ir, ang_ir = compute_gradients(ir_image)
    mag_vis, ang_vis = compute_gradients(visible_image)
    mag_fused, ang_fused = compute_gradients(fused_image)

    # similarity terms
    G_A = mag_ir
    G_B = mag_vis
    G_F = mag_fused
    Q_g = (2 * G_A * G_B) / (G_A ** 2 + G_B ** 2 + 1e-8)
    Q_a = 1 - np.abs(ang_ir - ang_vis) / (np.pi / 2)
    Q_AF = (Q_g * G_A * G_F + Q_a * (1 - G_A)) / (G_A + G_F + 1e-8)
    Q_BF = (Q_g * G_B * G_F + Q_a * (1 - G_B)) / (G_B + G_F + 1e-8)
    Q_ABF = (Q_AF + Q_BF) / 2

    numerator = np.sum(Q_ABF)
    denominator = np.sum(np.maximum(mag_ir, mag_vis))
    return numerator / (denominator + 1e-8)


def calculate_ms_ssim(ir_image, visible_image, fused_image):
    """
    Multi-scale structural similarity (MS-SSIM):
    $$MS\text{-}SSIM(x,y) = [l_M(x,y)]^{\alpha_M} \prod_{j=1}^{M} [cs_j(x,y)]^{\beta_j}$$
    """
    vis_tensor = torch.tensor(visible_image).unsqueeze(0).unsqueeze(0).float()
    fused_tensor = torch.tensor(fused_image).unsqueeze(0).unsqueeze(0).float()
    # piq exposes MS-SSIM as the functional multi_scale_ssim (there is no MS_SSIM class in piq)
    ms_ssim_score = multi_scale_ssim(vis_tensor, fused_tensor, data_range=1.0).item()
    return ms_ssim_score


def calculate_fmi(ir_image, visible_image, fused_image):
    """
    Feature mutual information (FMI):
    $$FMI = \frac{1}{|P|} \sum_{p \in P} MI(\vec{F}_p; \vec{V}_p) + MI(\vec{F}_p; \vec{I}_p)$$
    """
    # wavelet-domain features
    def wavelet_features(img):
        coeffs = pywt.dwt2(img, 'haar')
        cA, (cH, cV, cD) = coeffs
        return np.vstack([cA.ravel(), cH.ravel(), cV.ravel(), cD.ravel()])

    features_ir = wavelet_features(ir_image)
    features_vis = wavelet_features(visible_image)
    features_fused = wavelet_features(fused_image)

    # histogram-based mutual information
    def mutual_info(f1, f2):
        hist_2d, _, _ = np.histogram2d(f1, f2, bins=20)
        pxy = hist_2d / np.sum(hist_2d)
        px = np.sum(pxy, axis=1)
        py = np.sum(pxy, axis=0)
        px_py = px[:, None] * py[None, :]
        nzs = pxy > 0
        return np.sum(pxy[nzs] * np.log(pxy[nzs] / (px_py[nzs] + 1e-8)))

    mi_ir = mutual_info(features_fused[0], features_ir[0])
    mi_vis = mutual_info(features_fused[0], features_vis[0])
    return (mi_ir + mi_vis) / 2


def evaluate_fusion(ir_path, vis_path, fused_path):
    """Full evaluation pipeline."""
    # read the images and normalize to [0, 1]
    ir_image = cv2.imread(ir_path, cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0
    visible_image = cv2.imread(vis_path, cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0
    fused_image = cv2.imread(fused_path, cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.0

    # all three images must share the same size
    assert ir_image.shape == visible_image.shape == fused_image.shape

    # compute all metrics
    results = {
        'VIF': calculate_vif(ir_image, visible_image, fused_image),
        'SCD': calculate_scd(ir_image, visible_image, fused_image),
        'QAB/F': calculate_qabf(ir_image, visible_image, fused_image),
        'MS-SSIM': calculate_ms_ssim(ir_image, visible_image, fused_image),
        'FMI': calculate_fmi(ir_image, visible_image, fused_image)
    }
    return results


# usage example
if __name__ == "__main__":
    # replace with real image paths
    CT_DIR = "dataset/CT_MRI_Original/CT/CT_1_ind2rgb/001.png"
    MRI_DIR = "dataset/CT_MRI_Original/MRI/MRI_1_ind2rgb/001.png"
    FUSED_DIR = "outputs_ct_mri/CT_1_ind2rgb/001.png"

    metrics = evaluate_fusion(CT_DIR, MRI_DIR, FUSED_DIR)
    print("Fusion evaluation results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

Please fix this until it is complete and give me the full code.

class BboxLoss(nn.Module):
    def __init__(self, reg_max=16):
        """Initialize the BboxLoss module with regularization maximum and DFL settings."""
        super().__init__()
        self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None
        self.focal_loss = FocalLoss(alpha=0.25, gamma=2.0)  # new: Focal Loss

    def forward(self, pred_dist, pred_bboxes, pred_scores, anchor_points, target_bboxes,
                target_scores, target_scores_sum, fg_mask):
        """Compute three losses: IoU loss + DFL loss + Focal loss."""
        # IoU loss (original logic)
        weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
        iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False,
                       alpha_iou=True, alpha=3.0)
        if isinstance(iou, tuple):  # handle the multi-value return
            loss_iou = ((1.0 - iou[0]) * iou[1].detach() * weight).sum() / target_scores_sum
        else:
            loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum

        # DFL loss (original logic)
        if self.dfl_loss:
            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.dfl_loss.reg_max - 1)
            loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max),
                                     target_ltrb[fg_mask]) * weight
            loss_dfl = loss_dfl.sum() / target_scores_sum
        else:
            loss_dfl = torch.tensor(0.0).to(pred_dist.device)

        # new: Focal Loss to handle class imbalance
        # pred_scores: classification logits, [batch, num_anchors, num_classes]
        # target_scores: classification targets, [batch, num_anchors, num_classes]
        loss_focal = self.focal_loss(
            pred_scores[fg_mask],                         # classification loss on foreground samples only
            target_scores[fg_mask].to(pred_scores.dtype)  # keep dtypes consistent
        ).sum() / target_scores_sum                       # normalized like the IoU/DFL losses

        return loss_iou, loss_dfl, loss_focal  # the three losses

Focal Loss has been added to the code above. The current bbox_iou function is provided below; I would like bbox_iou revised to match the Focal Loss changes:

def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, alpha_iou=False, alpha=3.0, eps=1e-7):
    if xywh:  # transform from xywh to xyxy
        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
    else:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
        w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps

    # intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0)

    # union area
    union = w1 * h1 + w2 * h2 - inter + eps

    # IoU
    iou = inter / union
    if CIoU or DIoU or GIoU:
        cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex (smallest enclosing box) width
        ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height
        if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
            c2 = cw.pow(2) + ch.pow(2) + eps  # convex diagonal squared
            rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2).pow(2) +
                    (b2_y1 + b2_y2 - b1_y1 - b1_y2).pow(2)) / 4  # center dist**2
            if CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
                v = (4 / math.pi**2) * ((w2 / h2).atan() - (w1 / h1).atan()).pow(2)
                with torch.no_grad():
                    alpha = v / (v - iou + (1 + eps))
                return iou - (rho2 / c2 + v * alpha)  # CIoU
            return iou - rho2 / c2  # DIoU
        c_area = cw * ch + eps  # convex area
        return iou - (c_area - union) / c_area  # GIoU https://arxiv.org/pdf/1902.09630.pdf

    # Alpha-IoU
    if alpha_iou:
        alpha = alpha if alpha > 0 else 3.0  # default α = 3
        alpha_iou = 1 - ((1 - iou) ** alpha)  # Alpha-IoU formula
        return alpha_iou
    return iou  # IoU
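One way to read the `isinstance(iou, tuple)` branch in `BboxLoss.forward` is that, once Focal Loss joins the mix, `bbox_iou` should return both the Alpha-IoU term and the plain IoU so the latter can serve as a detached quality weight. A minimal self-contained sketch of that convention follows, assuming xyxy boxes; the function name and the two-value return are assumptions for illustration, not the Ultralytics API:

```python
import torch

def alpha_iou_with_weight(box1, box2, alpha=3.0, eps=1e-7):
    """Sketch: plain IoU for xyxy boxes, returned together with the Alpha-IoU term
    so BboxLoss's tuple branch can weight (1 - alpha_iou) by iou.detach()."""
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
    # intersection and union
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp_(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp_(0)
    union = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1) +
             (b2_x2 - b2_x1) * (b2_y2 - b2_y1) - inter + eps)
    iou = inter / union
    return 1 - (1 - iou) ** alpha, iou  # (Alpha-IoU, plain IoU used as the detached weight)
```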

If the pretrained HGNetV2 can't be downloaded automatically, please check your network connection,
or download the model manually from
https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth to weight/hgnetv2/.
Loaded stage1 B0 HGNetV2 from URL.
/root/D-FINE-master/src/nn/backbone/hgnetv2.py:534: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  state = torch.load(model_path, map_location="cpu")
Loaded stage1 B0 HGNetV2 from URL.
Initial lr: [0.0001, 0.0001, 0.0002, 0.0002]
building train_dataloader with batch_size=8...
[rank0]: Traceback (most recent call last):
[rank0]:   File "train.py", line 113, in <module>
[rank0]:     main(args)
[rank0]:   File "train.py", line 75, in main
[rank0]:     solver.fit()
[rank0]:   File "/root/D-FINE-master/src/solver/det_solver.py", line 22, in fit
[rank0]:     self.train()
[rank0]:   File "/root/D-FINE-master/src/solver/_solver.py", line 172, in train
[rank0]:     self.cfg.train_dataloader, shuffle=self.cfg.train_dataloader.shuffle
[rank0]:   File "/root/D-FINE-master/src/core/yaml_config.py", line 79, in train_dataloader
[rank0]:     self._train_dataloader = self.build_dataloader("train_dataloader")
[rank0]:   File "/root/D-FINE-master/src/core/yaml_config.py", line 185, in build_dataloader
[rank0]:     loader = create(name, global_cfg, batch_size=bs)
[rank0]:   File "/root/D-FINE-master/src/core/workspace.py", line 119, in create
[rank0]:     module = getattr(cfg["_pymodule"], name)
[rank0]: KeyError: '_pymodule'
E0716 16:37:12.999565 140709997880512 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 797171) of binary: /root/miniconda3/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-07-16_16:37:12
  host      : autodl-container-9efe41855c-0b570216
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 797171)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
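A hedged reading of this failure (not confirmed by the log alone): `create()` in workspace.py could not find the dataloader type in its registry and fell back to `cfg["_pymodule"]`, which was never set. In decorator-based registries of this shape, that usually means the module defining the component was never imported, so its registration decorator never ran. A schematic sketch of the pattern, with illustrative names rather than D-FINE's exact API:

```python
# Illustrative registry sketch: a component is only creatable if the module
# defining it was imported, because importing is what runs the decorator.
GLOBAL_CONFIG = {}

def register(cls):
    GLOBAL_CONFIG[cls.__name__] = cls  # record the class under its name
    return cls

def create(name, **kwargs):
    if name not in GLOBAL_CONFIG:
        # D-FINE's fallback path then probes cfg["_pymodule"], producing the KeyError above
        raise KeyError(f"{name} is not registered; check that its defining module is imported")
    return GLOBAL_CONFIG[name](**kwargs)
```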

Traceback (most recent call last):
  File "D:/YOLOV8/ultralytics-main/datasets/demo_2.py", line 10, in <module>
    results = model.train(data='data.yaml', epochs=100, workers=0, batch=16, imgsz=640, device=0,
                          lr0=0.001, weight_decay=0.05, mixup=0.2, hsv_h=0.015)
  File "D:\YOLOV8\ultralytics-main\ultralytics\engine\model.py", line 803, in train
    self.trainer.train()
  File "D:\YOLOV8\ultralytics-main\ultralytics\engine\trainer.py", line 207, in train
    self._do_train(world_size)
  File "D:\YOLOV8\ultralytics-main\ultralytics\engine\trainer.py", line 385, in _do_train
    self.loss, self.loss_items = self.model(batch)
  File "D:\anaconda\envs\yolo8\lib\site-packages\torch\nn\modules\module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\YOLOV8\ultralytics-main\ultralytics\nn\tasks.py", line 111, in forward
    return self.loss(x, *args, **kwargs)
  File "D:\YOLOV8\ultralytics-main\ultralytics\nn\tasks.py", line 293, in loss
    return self.criterion(preds, batch)
  File "D:\YOLOV8\ultralytics-main\ultralytics\utils\loss.py", line 268, in __call__
    loss[0], loss[2] = self.bbox_loss(
  File "D:\anaconda\envs\yolo8\lib\site-packages\torch\nn\modules\module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\YOLOV8\ultralytics-main\ultralytics\utils\loss.py", line 128, in forward
    loss_cls = self.focal_loss(pred_cls, target_cls).sum() / target_scores_sum
  File "D:\anaconda\envs\yolo8\lib\site-packages\torch\nn\modules\module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\YOLOV8\ultralytics-main\ultralytics\utils\loss.py", line 50, in forward
    loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
  File "D:\anaconda\envs\yolo8\lib\site-packages\torch\nn\functional.py", line 2980, in binary_cross_entropy_with_logits
    raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
ValueError: Target size (torch.Size([793, 5])) must be the same as input size (torch.Size([793, 64]))

Please help me fix the following FocalLoss class:

class FocalLoss(nn.Module):
    """Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)."""

    def __init__(self, alpha=3.0, gamma=2):  # added parameters
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    @staticmethod
    def forward(pred, label, gamma=2, alpha=3.0):
        """Calculates the focal loss for object detection/classification tasks."""
        loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
        # p_t = torch.exp(-loss)
        # loss *= self.alpha * (1.000001 - p_t) ** self.gamma  # non-zero power for gradient stability

        # TF implementation https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/losses/focal_loss.py
        pred_prob = pred.sigmoid()  # prob from logits
        p_t = label * pred_prob + (1 - label) * (1 - pred_prob)
        modulating_factor = (1.0 - p_t) ** gamma
        loss *= modulating_factor
        if alpha > 0:
            alpha_factor = label * alpha + (1 - label) * (1 - alpha)
            loss *= alpha_factor
        return loss.mean(1).sum()
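Reading the shapes in the error message: the target has 5 columns, which matches a 5-class dataset, while the prediction has 64, which matches reg_max * 4 = 64, i.e. the box-distribution logits. So the focal loss most likely received `pred_distri` instead of `pred_scores`. Below is a hedged rewrite of the class that uses the instance's alpha/gamma (the original staticmethod ignored them) and fails fast on mismatched shapes; the call site in loss.py still has to pass the classification logits:

```python
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Focal loss on sigmoid logits (sketch). Replaces the mismatched staticmethod
    signature with the instance's alpha/gamma."""
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, pred, label):
        # The reported crash (pred [793, 64] vs label [793, 5]) means the classification
        # branch received box-distribution logits; pass pred_scores, not pred_distri.
        assert pred.shape == label.shape, f"focal loss shape mismatch: {pred.shape} vs {label.shape}"
        loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
        pred_prob = pred.sigmoid()
        p_t = label * pred_prob + (1 - label) * (1 - pred_prob)  # prob of the true class
        loss *= (1.0 - p_t) ** self.gamma                        # down-weight easy examples
        if self.alpha > 0:
            loss *= label * self.alpha + (1 - label) * (1 - self.alpha)
        return loss.mean(1).sum()
```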

C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\thinc\shims\pytorch.py:261: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  model.load_state_dict(torch.load(filelike, map_location=device))
Traceback (most recent call last):
  File "C:\Users\86159\Desktop\Learn_pytorch\train.py", line 41, in <module>
    train()
  File "C:\Users\86159\Desktop\Learn_pytorch\train.py", line 24, in train
    for batch in train_loader:
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\dataloader.py", line 701, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\dataloader.py", line 757, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\_uti
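The traceback above is cut off, but the FutureWarning at its top is self-contained and easy to address: for checkpoints that are plain state dicts, torch.load accepts weights_only=True in recent PyTorch releases, which restricts unpickling to tensor data. A minimal sketch ('model.pth' and the placeholder module below are illustrative):

```python
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(10, 2)  # placeholder; substitute the real architecture

# weights_only=True limits unpickling to tensor data, which is safe
# for checkpoints that are plain state dicts.
state_dict = torch.load("model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
```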

Using device: cuda
training
  0%|          | 0/30000 [00:00<?, ?it/s]
Epoch 0: final error 0.0025133900344371796
  0%|          | 1/30000 [00:00<3:34:01, 2.34it/s]
c:\Users\cw\Desktop\model_HOT.py:192: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint = torch.load('model_HOT.pth', map_location=device)
D:\Aconda\envs\pytorch\lib\site-packages\torch\functional.py:534: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\TensorShape.cpp:3596.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Traceback (most recent call last):
  File "c:\Users\cw\Desktop\model_HOT.py", line 215, in <module>
    u_pred = U(xyt)
  File "D:\Aconda\envs\pytorch\lib\site-packages\torch\nn\modules\module

Traceback (most recent call last):
  File "/home/admin123/code/transformer_xl_encoder_decoder.py", line 210, in <module>
    output, enc_mems, dec_mems = model(src, tgt)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/admin123/code/transformer_xl_encoder_decoder.py", line 169, in forward
    enc_out, new_enc_mems = self.encoder(src_emb, enc_mems)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/admin123/code/transformer_xl_encoder_decoder.py", line 114, in forward
    x, new_mem = layer(x, mems[i] if mems is not None else None)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/admin123/code/transformer_xl_encoder_decoder.py", line 75, in forward
    attn_out, _ = self.attn(
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/admin123/anaconda3/envs/llama_factory/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: MultiheadAttention.forward() got an unexpected keyword argument 'pos_emb'
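This TypeError is unambiguous: torch.nn.MultiheadAttention.forward accepts query, key, and value plus masking-related keyword arguments, and has no pos_emb parameter, so Transformer-XL-style relative position embeddings cannot be passed straight through it. Implementing true relative attention needs a custom layer; a simpler hedged workaround is to fold the position information into the inputs before calling the stock module, as sketched below with illustrative shapes:

```python
import torch
import torch.nn as nn

d_model, n_heads, seq_len, batch = 64, 4, 16, 2
attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

x = torch.randn(batch, seq_len, d_model)
pos_emb = torch.randn(seq_len, d_model)  # learned or sinusoidal positions (illustrative)

# Fold positions into the inputs instead of passing pos_emb= to forward(),
# which nn.MultiheadAttention does not accept. This is an absolute-position
# approximation of the relative scheme, not Transformer-XL attention itself.
q = k = x + pos_emb
attn_out, attn_weights = attn(q, k, x, need_weights=False)
```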
