
Comments (4)

johnsmith0031 commented on July 24, 2024

I think technically we can do both, with the original model weights in int4 + fp16/fp32 and the LoRA weights in fp32. The output dtype of a QuantLinear layer can be made exactly the dtype of its input, so inserting a LoRA layer after every QuantLinear layer won't be very difficult.

I made an adapter for peft to support QuantLinear:

import math

import torch
import torch.nn as nn

# Assumed import locations: LoraLayer has moved around across peft versions,
# and quant is GPTQ-for-LLaMa's quantization module.
from peft.tuners.lora import LoraLayer
from quant import QuantLinear


class Linear4bitLt(QuantLinear, LoraLayer):
    # LoRA implemented on top of a 4-bit quantized dense layer
    def __init__(
            self,
            in_features,
            out_features,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            **kwargs,
    ):
        QuantLinear.__init__(
            self,
            4,  # quantization bit width
            in_features,
            out_features
        )
        LoraLayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Linear(in_features, r, bias=False)
            self.lora_B = nn.Linear(r, out_features, bias=False)
            self.scaling = self.lora_alpha / self.r
            # Freeze the quantized pre-trained weights; only lora_A/lora_B train
            self.qweight.requires_grad = False
            self.scales.requires_grad = False
            self.zeros.requires_grad = False
            self.bias.requires_grad = False
        self.reset_parameters()

    def reset_parameters(self):
        if hasattr(self, "lora_A"):
            # Initialize A the same way as the default for nn.Linear and B to zero,
            # so the adapter starts out as a zero update
            nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B.weight)

    def forward(self, x: torch.Tensor):
        # Quantized base-layer output (QuantLinear.forward via the MRO)
        result = super().forward(x)

        if self.disable_adapters:
            return result
        elif self.r > 0:
            if not torch.is_autocast_enabled():
                expected_dtype = result.dtype

                # Run the LoRA path in fp32, then cast back to the base output dtype
                if x.dtype != torch.float32:
                    x = x.float()
                output = self.lora_B(self.lora_A(self.lora_dropout(x))).to(expected_dtype) * self.scaling
                result += output
            else:
                # Under autocast, let autocast pick the dtypes
                output = self.lora_B(self.lora_A(self.lora_dropout(x))) * self.scaling
                result += output
        return result
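For context, here is a rough sketch of how such a layer might be swapped in for the QuantLinear modules of an already-quantized model. The helper name and the infeatures/outfeatures/qweight/scales/zeros/bias attribute names are assumptions about GPTQ-for-LLaMa's QuantLinear internals, not part of the comment above:

def replace_quant_linear_with_lora(model, r=8, lora_alpha=16):
    # Walk the module tree and swap every QuantLinear for Linear4bitLt,
    # carrying the quantized buffers over so the base weights are unchanged.
    for name, module in model.named_children():
        if isinstance(module, QuantLinear) and not isinstance(module, Linear4bitLt):
            new_layer = Linear4bitLt(module.infeatures, module.outfeatures,
                                     r=r, lora_alpha=lora_alpha)
            new_layer.qweight = module.qweight
            new_layer.scales = module.scales
            new_layer.zeros = module.zeros
            new_layer.bias = module.bias
            setattr(model, name, new_layer)
        else:
            replace_quant_linear_with_lora(module, r=r, lora_alpha=lora_alpha)
    return model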

The only thing left to do is to add support for the backward pass (gradients) through the 4-bit matmul.
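To illustrate what that involves, here is a minimal sketch of a custom autograd function that dequantizes the weight only for the backward pass, so gradients can flow to LoRA layers below it. quant_matmul_4bit and dequantize_weight are hypothetical helpers standing in for the real int4 kernels, not functions GPTQ-for-LLaMa provides under those names:

import torch

class QuantMatMul4bit(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, qweight, scales, zeros):
        ctx.save_for_backward(qweight, scales, zeros)
        # quant_matmul_4bit is a placeholder for the actual int4 kernel
        return quant_matmul_4bit(x, qweight, scales, zeros)

    @staticmethod
    def backward(ctx, grad_output):
        qweight, scales, zeros = ctx.saved_tensors
        # dequantize_weight is a placeholder; assume it returns the
        # fp16/fp32 weight with shape [out_features, in_features]
        weight = dequantize_weight(qweight, scales, zeros)
        # y = x @ W.T  =>  dL/dx = dL/dy @ W; the frozen quantized
        # parameters themselves get no gradient
        return grad_output @ weight, None, None, None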


gururise commented on July 24, 2024

> Most probably yes, you can merge the LoRA weights into the model and quantize that

Training in 4-bit or just 4-bit inference? Training in 4-bit would be a game-changer.


Semisol commented on July 24, 2024

Most probably yes, you can merge the LoRA weights into the model and quantize that


FNsi commented on July 24, 2024

> Most probably yes, you can merge the LoRA weights into the model and quantize that

May I ask how to merge the LoRA?
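For reference, merging folds the low-rank update into the dense weight: LoRA adds (lora_alpha / r) * B @ A on top of the frozen weight, so the merge is a single in-place addition. A minimal sketch for one nn.Linear, assuming unquantized fp16/fp32 base weights:

import torch
import torch.nn as nn

@torch.no_grad()
def merge_lora(linear: nn.Linear, lora_A: nn.Linear, lora_B: nn.Linear,
               lora_alpha: int, r: int):
    # lora_B.weight is [out, r] and lora_A.weight is [r, in], so their
    # product matches linear.weight's [out, in] shape exactly.
    scaling = lora_alpha / r
    linear.weight += scaling * (lora_B.weight @ lora_A.weight)
    return linear

After the merge the adapter can be dropped and the combined weight quantized as usual, which is what the suggestion above amounts to.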

