PyTorch torchtune.modules.peft.lora

torchtune
https://docs.pytorch.org/torchtune/main/_modules/index.html
https://docs.pytorch.org/torchtune/0.6/_modules/index.html

1. Source code for torchtune.modules.peft.lora

https://docs.pytorch.org/torchtune/main/_modules/torchtune/modules/peft/lora.html
https://docs.pytorch.org/torchtune/0.6/_modules/torchtune/modules/peft/lora.html

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import math
from enum import Enum
from typing import Optional, Union

import torch
import torch.nn.functional as F

from torch import nn

from torchao.dtypes.nf4tensor import linear_nf4, to_nf4
from torchtune.modules.low_precision import _register_nf4_dispatch_ops  # noqa: F401
from torchtune.modules.peft import AdapterModule


class TrainableParams(Enum):
    FULL = "full"
    LORA = "lora"
    FROZEN = "frozen"


class LoRALinear(nn.Module, AdapterModule):
    """LoRA linear layer as introduced in `LoRA: Low-Rank Adaptation of Large Language Models <https://arxiv.org/abs/2106.09685>`_.

    LoRA perturbs a given layer via a low-rank approximation where only
    the rank decomposition matrices are trainable. In a linear layer instead of
    :math:`x \\mapsto W_0x` a LoRALinear layer is defined as
    :math:`x \\mapsto W_0x + (\\alpha / r)BAx`, where :math:`r` is the rank of
    the matrices :math:`A` and :math:`B` and :math:`\\alpha` is a scaling factor.
    As in the original implementation, we support dropout before multiplication
    by the low-rank matrices.

    Args:
        in_dim (int): input dimension
        out_dim (int): output dimension
        rank (int): rank of the low-rank approximation
        alpha (float): scaling factor for the low-rank approximation
        dropout (float): dropout probability. Default: 0.0
        use_bias (bool): whether to include bias in the original linear layer.
            Default: False
        quantize_base (bool): Whether to quantize base linear weight or not.
            Default: False
        **quantization_kwargs: Keyword arguments to pass to `to_nf4` when quantizing the base linear weight.
            Examples of valid arguments are `block_size` and `scaler_block_size`, which control the granularity of
            weight quantization and scaler quantization respectively. This is only used if `quantize_base` is True.
            Default None

    Raises:
        ValueError: If ``quantize_base`` is False, but quantization kwargs are provided.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        rank: int,
        alpha: float,
        dropout: float = 0.0,
        use_bias: bool = False,
        quantize_base: bool = False,
        **quantization_kwargs,
    ):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.rank = rank
        self.alpha = alpha
        self.use_bias = use_bias
        self._quantize_base = quantize_base

        if not self._quantize_base and any([v for v in quantization_kwargs.values()]):
            raise ValueError(
                f"``quantize_base`` is False, but received the following quantization arguments: {quantization_kwargs}"
            )

        # Setup weight and bias
        linear = nn.Linear(in_features=in_dim, out_features=out_dim, bias=self.use_bias)
        # y = x @ W^T + B
        # Input: (N, in_dim) = (N, in_features)
        # Weight: (out_dim, in_dim) = (out_features, in_features)
        # Bias: (out_dim) = (out_features)
        # Output: (N, out_dim) = (N, out_features)

        weight = (
            linear.weight
            if not self._quantize_base
            else to_nf4(linear.weight, **quantization_kwargs)
        )
        # Weight: (out_dim, in_dim) = (out_features, in_features)

        bias = linear.bias if self.use_bias else None
        # Bias: (out_dim) = (out_features)

        # 'self.disabled' is a flag showing whether to turn off LoRA adapters,
        # this can be used in DPO for treating the lora adapters as the policy model
        # and disabling it to treat the base model as the reference model
        self.disabled = False
        self.register_parameter("weight", nn.Parameter(weight))
        self.register_parameter(
            "bias", nn.Parameter(bias) if bias is not None else None
        )
        self.dropout = nn.Dropout(p=dropout) if dropout > 0.0 else nn.Identity()
        self.lora_a = nn.Linear(in_features=in_dim, out_features=rank, bias=False)
        # y = x @ W^T + B
        # Input: (N, in_dim) = (N, in_features)
        # Weight: (rank, in_dim) = (out_features, in_features)
        # Bias: (rank) = (out_features)
        # Output: (N, rank) = (N, out_features)

        self.lora_b = nn.Linear(in_features=rank, out_features=out_dim, bias=False)
        # y = x @ W^T + B
        # Input: (N, rank) = (N, in_features)
        # Weight: (out_dim, rank) = (out_features, in_features)
        # Bias: (out_dim) = (out_features)
        # Output: (N, out_dim) = (N, out_features)

        self.merged = False
        self.initialize_parameters()

    def to_empty(
        self, *, device: Optional[Union[str, torch.device, int]], recurse: bool = True
    ):
        self.lora_a.to_empty(device=device, recurse=recurse)
        self.lora_b.to_empty(device=device, recurse=recurse)

    def initialize_parameters(self):
        # Initialize as in
        # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L119
        _lora_a_init_params(self.lora_a)
        _lora_b_init_params(self.lora_b)

    def adapter_params(self) -> list[str]:
        """
        Return a list of strings corresponding to the names of the ``nn.Parameter`` s in
        the model coming from the adapter.

        For LoRA this means lora_a.weight and lora_b.weight.
        """
        # NOTE: this function has to be updated if the names of "lora_a" and "lora_b"
        # in this module change.
        adapter_params = ["lora_a.weight", "lora_b.weight"]
        return adapter_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input tensor with shape ``(..., in_dim)``

        Returns:
            torch.Tensor: output tensor with shape ``(..., out_dim)``

        """
        if self._quantize_base:
            out = linear_nf4(input=x, weight=self.weight)
            if self.use_bias:
                out = out + self.bias
        else:
            out = F.linear(x, self.weight, self.bias)
            # y = x @ W^T + B
            # Input: (N, in_dim) = (N, in_features)
            # Weight: (out_dim, in_dim) = (out_features, in_features)
            # Bias: (out_dim) = (out_features)
            # Output: (N, out_dim) = (N, out_features)

        if self.disabled:
            return out
        lora_out = self.lora_a(self.dropout(x))
        # y = x @ W^T + B
        # Input: (N, in_dim) = (N, in_features)
        # Weight: (rank, in_dim) = (out_features, in_features)
        # Bias: (rank) = (out_features)
        # Output: (N, rank) = (N, out_features)
        
        lora_out = (self.alpha / self.rank) * self.lora_b(lora_out)
        # y = x @ W^T + B
        # Input: (N, rank) = (N, in_features)
        # Weight: (out_dim, rank) = (out_features, in_features)
        # Bias: (out_dim) = (out_features)
        # Output: (N, out_dim) = (N, out_features)
        
        return out + lora_out


class QATLoRALinear(LoRALinear):
    """
    LoRA linear layer with quantization-aware training (QAT) applied to the
    activations and/or weights before the low rank adapters.

    QAT leverages fake quantization to simulate the quantization numerics during
    training without actually casting the data to lower precision. This class
    combines LoRA with QAT to improve the final quantized accuracy during inference
    while reducing the memory required during training.

    Args:
        in_dim (int): input dimension
        out_dim (int): output dimension
        rank (int): rank of the low-rank approximation
        alpha (float): scaling factor for the low-rank approximation
        dropout (float): dropout probability. Default: 0.0
        activation_qat_config (Optional[FakeQuantizeConfig]): config for specifying
            how input activations will be fake quantized, defaults to None
        weight_qat_config (Optional[FakeQuantizeConfig]): config for specifying
            how weights will be fake quantized, defaults to None

    Raises:
        ValueError: If `in_dim` is not divisible by weight `group_size`

    Example usage::

        activation_qat_config = FakeQuantizeConfig(
            dtype=torch.int8,
            granularity="per_token",
            is_symmetric=False,
        )
        weight_qat_config = FakeQuantizeConfig(
            dtype=torch.int4,
            group_size=8,
            is_symmetric=True,
        )
        qat_lora_linear = QATLoRALinear(
            in_dim=512,
            out_dim=1024,
            rank=8,
            alpha=16,
            dropout=0.0,
            activation_qat_config=activation_qat_config,
            weight_qat_config=weight_qat_config,
        )
        qat_lora_linear(torch.randn(512))
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        rank: int,
        alpha: float,
        dropout: float = 0.0,
        # fake quantize configs
        # TODO: make the types Optional[FakeQuantizeConfig] once we
        # support torchao 0.7+ by default
        activation_qat_config: Optional["FakeQuantizeConfig"] = None,
        weight_qat_config: Optional["FakeQuantizeConfig"] = None,
    ):
        super().__init__(
            in_dim,
            out_dim,
            rank,
            alpha,
            dropout,
            use_bias=False,
            quantize_base=False,
        )

        try:
            from torchao.quantization.qat.api import FakeQuantizeConfig
            from torchao.quantization.qat.fake_quantizer import FakeQuantizer
        except ImportError as err:
            raise ValueError(
                "QATLoRALinear is only compatible with torchao 0.7+"
            ) from err

        # initialize activation fake quantizer
        if activation_qat_config is not None:
            assert isinstance(activation_qat_config, FakeQuantizeConfig)
            self.activation_fake_quantizer = FakeQuantizer(activation_qat_config)
        else:
            self.activation_fake_quantizer = nn.Identity()

        # initialize weight fake quantizer
        if weight_qat_config is not None:
            assert isinstance(weight_qat_config, FakeQuantizeConfig)
            group_size = weight_qat_config.group_size
            if group_size is not None and in_dim % group_size != 0:
                raise ValueError(
                    "in_dim (%s) must be divisible by group_size (%s)"
                    % (in_dim, group_size)
                )
            self.weight_fake_quantizer = FakeQuantizer(weight_qat_config)
        else:
            self.weight_fake_quantizer = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input tensor with shape ``(..., in_dim)``

        Returns:
            torch.Tensor: output tensor with shape ``(..., out_dim)``

        """
        _x = self.activation_fake_quantizer(x)
        w = self.weight_fake_quantizer(self.weight)
        out = F.linear(_x, w)
        if self.disabled:
            return out
        lora_out = self.lora_a(self.dropout(x))
        lora_out = (self.alpha / self.rank) * self.lora_b(lora_out)
        return out + lora_out

    @classmethod
    def from_lora_linear(
        cls,
        lora_linear: LoRALinear,
        # TODO: make the types Optional[FakeQuantizeConfig] once we
        # support torchao 0.7+ by default
        activation_qat_config: Optional["FakeQuantizeConfig"] = None,
        weight_qat_config: Optional["FakeQuantizeConfig"] = None,
    ) -> "QATLoRALinear":
        """
        Create a `QATLoRALinear` from an existing `LoRALinear`,
        preserving the weights and adapters.
        """
        if lora_linear.bias is not None:
            raise ValueError("Bias is not supported in QAT + LoRA yet")
        if lora_linear._quantize_base:
            raise ValueError("quantize_base is not compatible with QAT + LoRA")
        if isinstance(lora_linear.dropout, nn.Dropout):
            dropout = lora_linear.dropout.p
        else:
            dropout = 0.0
        new_linear = cls(
            lora_linear.in_dim,
            lora_linear.out_dim,
            lora_linear.rank,
            lora_linear.alpha,
            dropout,
            activation_qat_config,
            weight_qat_config,
        )
        # In distributed training, the model may be instantiated
        # on the meta device, in which case there is no need to
        # copy the weights, and doing so will result in an error
        if lora_linear.weight.device != torch.device("meta"):
            new_linear.weight = lora_linear.weight
        if lora_linear.lora_a.weight.device != torch.device("meta"):
            new_linear.lora_a.weight = lora_linear.lora_a.weight
        if lora_linear.lora_b.weight.device != torch.device("meta"):
            new_linear.lora_b.weight = lora_linear.lora_b.weight
        return new_linear


def _lora_a_init_params(x: nn.Linear) -> None:
    """
    Initialize LoRA A weight to Kaiming uniform.
    """
    nn.init.kaiming_uniform_(x.weight, a=math.sqrt(5))


def _lora_b_init_params(x: nn.Linear) -> None:
    """
    Initialize LoRA B weight to zeros.
    """
    nn.init.zeros_(x.weight)
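
Because `_lora_b_init_params` zero-initializes the B matrix, a freshly constructed `LoRALinear` behaves exactly like its base linear layer until training updates the adapters. A minimal sketch that checks this property (dimensions and hyperparameters are chosen only for illustration):

```python
import torch
import torch.nn.functional as F

from torchtune.modules.peft import LoRALinear

torch.manual_seed(0)
layer = LoRALinear(in_dim=64, out_dim=128, rank=8, alpha=16.0)

x = torch.randn(4, 64)
with torch.no_grad():
    base_out = F.linear(x, layer.weight)  # base path only (use_bias defaults to False)
    lora_out = layer(x)                   # base + (alpha / r) * B(A(dropout(x)))

# lora_b is zero-initialized, so the adapter contribution is exactly zero at init.
assert torch.allclose(base_out, lora_out)
print(lora_out.shape)  # torch.Size([4, 128])
```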

2. LoRALinear

https://docs.pytorch.org/torchtune/main/generated/torchtune.modules.peft.LoRALinear.html

class torchtune.modules.peft.LoRALinear(in_dim: int, out_dim: int, rank: int, alpha: float, dropout: float = 0.0, use_bias: bool = False, quantize_base: bool = False, **quantization_kwargs)

LoRA linear layer as introduced in LoRA: Low-Rank Adaptation of Large Language Models https://arxiv.org/abs/2106.09685.

LoRA perturbs a given layer via a low-rank approximation where only the rank decomposition matrices are trainable. In a linear layer, instead of $x \mapsto W_0 x$, a LoRALinear layer is defined as $x \mapsto W_0 x + (\alpha / r) B A x$, where $r$ is the rank of the matrices $A$ and $B$ and $\alpha$ is a scaling factor.

X = Input: (N, in_dim) = (N, in_features)
W = Weight: (out_dim, in_dim) = (out_features, in_features)
X @ (W)^T: (N, in_dim) @ (in_dim, out_dim) = (N, out_dim)

B Weight: (out_dim, rank) = (out_features, in_features)
A Weight: (rank, in_dim) = (out_features, in_features)
BA: (out_dim, rank) @ (rank, in_dim) = (out_dim, in_dim)
X @ (BA)^T -> Y: (N, in_dim) @ (in_dim, out_dim) = (N, out_dim)

X @ (A)^T -> V: (N, in_dim) @ (in_dim, rank) = (N, rank)
V @ (B)^T -> Y: (N, rank) @ (rank, out_dim) = (N, out_dim)

X @ (A)^T @ (B)^T = X @ (BA)^T -> Y: (N, out_dim)
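
The identity used above, $x (BA)^\top = (x A^\top) B^\top$, is what lets the implementation apply `lora_a` and then `lora_b` without ever materializing the full $(out\_dim, in\_dim)$ update matrix $BA$. A quick numerical check of the equivalence (dimensions are arbitrary placeholders):

```python
import torch

N, in_dim, rank, out_dim = 2, 16, 4, 32
x = torch.randn(N, in_dim)
A = torch.randn(rank, in_dim)    # lora_a.weight: (rank, in_dim)
B = torch.randn(out_dim, rank)   # lora_b.weight: (out_dim, rank)

# Apply A then B, as LoRALinear does, never forming the (out_dim, in_dim) product.
y_factored = (x @ A.T) @ B.T     # (N, rank) -> (N, out_dim)
# Equivalent update computed with the full matrix: x @ (B @ A)^T.
y_full = x @ (B @ A).T           # (N, out_dim)

assert torch.allclose(y_factored, y_full, rtol=1e-5, atol=1e-5)
print(y_factored.shape)          # torch.Size([2, 32])
```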

As in the original implementation, we support dropout before multiplication by the low-rank matrices.

Parameters:

  • in_dim (int): input dimension
  • out_dim (int): output dimension
  • rank (int): rank of the low-rank approximation
  • alpha (float): scaling factor for the low-rank approximation
  • dropout (float): dropout probability. Default: 0.0
  • use_bias (bool): whether to include bias in the original linear layer. Default: False
  • quantize_base (bool): Whether to quantize base linear weight or not. Default: False
  • **quantization_kwargs: Keyword arguments to pass to to_nf4 when quantizing the base linear weight. Examples of valid arguments are block_size and scaler_block_size, which control the granularity of weight quantization and scaler quantization respectively. This is only used if quantize_base is True. Default None

Raises:

ValueError: If quantize_base is False, but quantization kwargs are provided.
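
When quantize_base is True, the base weight is stored as an NF4 tensor (this requires torchao) and any extra keyword arguments are forwarded to to_nf4. A construction sketch, where the block sizes are example values only:

```python
from torchtune.modules.peft import LoRALinear

# Base weight quantized to NF4; block_size / scaler_block_size are forwarded to to_nf4.
qlora_layer = LoRALinear(
    in_dim=4096,
    out_dim=4096,
    rank=8,
    alpha=16.0,
    dropout=0.05,
    quantize_base=True,
    block_size=64,           # granularity of weight quantization (example value)
    scaler_block_size=256,   # granularity of scaler quantization (example value)
)
```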

2.1. def adapter_params(self) -> list[str]

Return a list of strings corresponding to the names of the nn.Parameter objects in the model coming from the adapter.

For LoRA this means lora_a.weight and lora_b.weight.
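
In torchtune, adapter_params() is what the PEFT utilities consult to decide which parameters remain trainable. A hand-rolled sketch of that freezing logic (the helper below is illustrative, not the library's own function):

```python
import torch

def freeze_all_but_adapters(model: torch.nn.Module) -> None:
    # Collect fully-qualified adapter parameter names, e.g. "layers.0.attn.q_proj.lora_a.weight".
    adapter_names = set()
    for module_name, module in model.named_modules():
        if hasattr(module, "adapter_params"):
            for local_name in module.adapter_params():  # ["lora_a.weight", "lora_b.weight"]
                prefix = f"{module_name}." if module_name else ""
                adapter_names.add(f"{prefix}{local_name}")

    # Only the LoRA A/B matrices keep requires_grad=True.
    for param_name, param in model.named_parameters():
        param.requires_grad = param_name in adapter_names
```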

2.2. def forward(self, x: torch.Tensor) -> torch.Tensor

Parameters:
x (torch.Tensor) - input tensor with shape (..., in_dim)

Returns:
output tensor with shape (..., out_dim)

Return type:
torch.Tensor
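
The adapter acts on the last dimension only, so any leading batch dimensions are preserved. For example (shapes chosen arbitrarily):

```python
import torch
from torchtune.modules.peft import LoRALinear

layer = LoRALinear(in_dim=256, out_dim=512, rank=8, alpha=16.0)
x = torch.randn(2, 10, 256)   # (batch, seq_len, in_dim)
y = layer(x)
print(y.shape)                # torch.Size([2, 10, 512])
```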

2.3. def to_empty(self, *, device: Optional[Union[str, torch.device, int]], recurse: bool = True)

Move the parameters and buffers to the specified device without copying storage.

Parameters:
device (torch.device) - The desired device of the parameters and buffers in this module.

recurse (bool) - Whether parameters and buffers of submodules should be recursively moved to the specified device.

Returns:
self

Return type:
Module
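
This override only materializes lora_a and lora_b; the base weight is expected to be filled in later from a checkpoint. A sketch of the meta-device flow this supports (the device choice and the use of torch.device as a context manager are assumptions about the caller's setup):

```python
import torch
from torchtune.modules.peft import LoRALinear

# Build the module on the meta device: no memory is allocated for parameters.
with torch.device("meta"):
    layer = LoRALinear(in_dim=1024, out_dim=1024, rank=8, alpha=16.0)

# Materialize only the adapter weights, then re-run their initialization.
layer.to_empty(device="cpu")
layer.initialize_parameters()

print(layer.lora_a.weight.device)  # cpu
print(layer.weight.device)         # meta -- base weight still to be loaded from a checkpoint
```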

References

[1] Yongqiang Cheng, https://yongqiang.blog.csdn.net/
[2] Python operator - Standard operators as functions, https://yongqiang.blog.csdn.net/article/details/148656267

### Fine-tuning for llama.cpp with LoRA

LoRA (Low-Rank Adaptation) is a parameter-efficient transfer-learning method that adapts a model by updating only low-rank matrices. For a `llama.cpp` deployment, LoRA fine-tuning can be combined with the usual workflow to adapt a model to a specific task. The following outline is assembled from the cited references and general practice:

#### 1. Prepare the environment

Make sure the required dependencies are installed and the development environment is set up. Typically, Python and PyTorch are used to load the pretrained model and apply LoRA fine-tuning to it [^1].

```bash
pip install torch transformers peft datasets accelerate
```

#### 2. Load the base model

Use the Hugging Face toolchain to load the target model; it will later be converted into a format that `llama.cpp` can use. For example, a Qwen or other LLaMA-variant model can be loaded as follows [^2]:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "your-model-path"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    load_in_8bit=False,
    device_map="auto",
    trust_remote_code=True,
)
```

#### 3. Configure the LoRA parameters

Define the LoRA hyperparameters, which determine the structure of the adapter layers and which modules they affect. A common configuration looks like this [^1]:

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                                 # Rank of the update matrices.
    lora_alpha=32,                        # Scaling factor for the rank decomposition.
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA to (e.g. query and value projections).
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",                # Task type should match the application; here, causal language modeling.
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()
```

#### 4. Prepare the data and fine-tune

Build the dataset for the training stage, paying attention to issues such as the input sequence-length limit, then run the optimization until convergence [^2]:

```python
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="adamw_torch_fused",
    logging_dir="./logs",
    save_strategy="epoch",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_data,
    tokenizer=tokenizer,
)

trainer.train()
```

#### 5. Convert to GGUF format

After the steps above, the result can be exported to the `.gguf` format used by the `llama.cpp` toolchain for deployment [^1], after first merging the LoRA adapters into the base model (see the sketch after the notes below):

```bash
./llama-quantize \
    ./models/1.5B/qwen2-1.5b-question2-fp16.gguf \
    ./models/1.5B/qwen2-1.5b-question2-q4_0.gguf \
    q4_0
```

---

### Notes

- The workflow above assumes some machine-learning background and familiarity with the command line.
- In practice, further details such as hardware resource allocation may come up; adjust the approach to the specific situation.
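
One step the outline glosses over is that the trained LoRA adapters generally need to be merged back into the base model (or exported as a separate adapter file) before conversion to GGUF. A minimal sketch using peft's merge support, continuing from the objects defined in steps 2-4 (the output path is a placeholder):

```python
# Merge the LoRA updates into the base weights and save a plain Hugging Face
# checkpoint; llama.cpp's conversion script can then turn this into a GGUF file.
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("./merged-model")
tokenizer.save_pretrained("./merged-model")
```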