RuntimeError Traceback (most recent call last)
Cell In[16], line 3
1 generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
----> 3 generate_text("Look up the boiling point of water.")
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1109, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1101 return next(
1102 iter(
1103 self.get_iterator(
(...)
1106 )
1107 )
1108 else:
-> 1109 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1116, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1114 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1115 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1116 model_outputs = self.forward(model_inputs, **forward_params)
1117 outputs = self.postprocess(model_outputs, **postprocess_params)
1118 return outputs
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/pipelines/base.py:1015, in Pipeline.forward(self, model_inputs, **forward_params)
1013 with inference_context():
1014 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1015 model_outputs = self._forward(model_inputs, **forward_params)
1016 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1017 else:
Cell In[1], line 98, in InstructionTextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
96 input_ids = model_inputs["input_ids"]
97 attention_mask = model_inputs.get("attention_mask", None)
---> 98 generated_sequence = self.model.generate(
99 input_ids=input_ids.to(self.model.device),
100 attention_mask=attention_mask,
101 pad_token_id=self.tokenizer.pad_token_id,
102 **generate_kwargs,
103 )[0].cpu()
104 instruction_text = model_inputs.pop("instruction_text")
105 return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}
File /opt/conda/envs/textgen/lib/python3.10/site-packages/peft/peft_model.py:627, in PeftModelForCausalLM.generate(self, **kwargs)
625 try:
626 if not isinstance(self.peft_config, PromptLearningConfig):
--> 627 outputs = self.base_model.generate(**kwargs)
628 else:
629 if "input_ids" not in kwargs:
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/generation/utils.py:1508, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, streamer, **kwargs)
1500 input_ids, model_kwargs = self._expand_inputs_for_generation(
1501 input_ids=input_ids,
1502 expand_size=generation_config.num_return_sequences,
1503 is_encoder_decoder=self.config.is_encoder_decoder,
1504 **model_kwargs,
1505 )
1507 # 13. run sample
-> 1508 return self.sample(
1509 input_ids,
1510 logits_processor=logits_processor,
1511 logits_warper=logits_warper,
1512 stopping_criteria=stopping_criteria,
1513 pad_token_id=generation_config.pad_token_id,
1514 eos_token_id=generation_config.eos_token_id,
1515 output_scores=generation_config.output_scores,
1516 return_dict_in_generate=generation_config.return_dict_in_generate,
1517 synced_gpus=synced_gpus,
1518 streamer=streamer,
1519 **model_kwargs,
1520 )
1522 elif is_beam_gen_mode:
1523 if generation_config.num_return_sequences > generation_config.num_beams:
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/generation/utils.py:2547, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2544 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2546 # forward pass to get next token
-> 2547 outputs = self(
2548 **model_inputs,
2549 return_dict=True,
2550 output_attentions=output_attentions,
2551 output_hidden_states=output_hidden_states,
2552 )
2554 if synced_gpus and this_peer_finished:
2555 continue # don't waste resources running the code we don't need
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:662, in GPTNeoXForCausalLM.forward(self, input_ids, attention_mask, position_ids, inputs_embeds, head_mask, past_key_values, labels, use_cache, output_attentions, output_hidden_states, return_dict)
621 r"""
622 past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
623     Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
(...)
658 >>> prediction_logits = outputs.logits
659 ```"""
660 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 662 outputs = self.gpt_neox(
663 input_ids,
664 attention_mask=attention_mask,
665 position_ids=position_ids,
666 head_mask=head_mask,
667 inputs_embeds=inputs_embeds,
668 past_key_values=past_key_values,
669 use_cache=use_cache,
670 output_attentions=output_attentions,
671 output_hidden_states=output_hidden_states,
672 return_dict=return_dict,
673 )
675 hidden_states = outputs[0]
676 lm_logits = self.embed_out(hidden_states)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:553, in GPTNeoXModel.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
545 outputs = torch.utils.checkpoint.checkpoint(
546 create_custom_forward(layer),
547 hidden_states,
(...)
550 head_mask[i],
551 )
552 else:
--> 553 outputs = layer(
554 hidden_states,
555 attention_mask=attention_mask,
556 position_ids=position_ids,
557 head_mask=head_mask[i],
558 layer_past=layer_past,
559 use_cache=use_cache,
560 output_attentions=output_attentions,
561 )
562 hidden_states = outputs[0]
563 if use_cache is True:
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:320, in GPTNeoXLayer.forward(self, hidden_states, attention_mask, position_ids, head_mask, use_cache, layer_past, output_attentions)
310 def forward(
311 self,
312 hidden_states: Optional[torch.FloatTensor],
(...)
318 output_attentions: Optional[bool] = False,
319 ):
--> 320 attention_layer_outputs = self.attention(
321 self.input_layernorm(hidden_states),
322 attention_mask=attention_mask,
323 position_ids=position_ids,
324 layer_past=layer_past,
325 head_mask=head_mask,
326 use_cache=use_cache,
327 output_attentions=output_attentions,
328 )
329 attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights)
330 outputs = attention_layer_outputs[1:]
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:116, in GPTNeoXAttention.forward(self, hidden_states, attention_mask, position_ids, head_mask, layer_past, use_cache, output_attentions)
111 has_layer_past = layer_past is not None
113 # Compute QKV
114 # Attention heads [batch, seq_len, hidden_size]
115 # --> [batch, seq_len, (np * 3 * head_size)]
--> 116 qkv = self.query_key_value(hidden_states)
118 # [batch, seq_len, (num_heads * 3 * head_size)]
119 # --> [batch, seq_len, num_heads, 3 * head_size]
120 new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/peft/tuners/lora.py:530, in MergedLinear.forward(self, x)
528 result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
529 if self.r > 0:
--> 530 after_A = self.lora_A(self.lora_dropout(x))
531 after_B = self.lora_B(after_A.transpose(-2, -1)).transpose(-2, -1)
532 result += self.zero_pad(after_B) * self.scaling
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/envs/textgen/lib/python3.10/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
RuntimeError: expected scalar type BFloat16 but found Float
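
The RuntimeError in the last frame is a dtype mismatch inside peft's MergedLinear.forward (lora.py:530): the frozen GPT-NeoX base weights are bfloat16, so the hidden states arriving at query_key_value are bfloat16, but the LoRA adapter matrices lora_A/lora_B are still float32, and F.linear rejects mixed input/weight dtypes. A minimal workaround sketch, assuming `model` is the PeftModel from cell 16 and that its base weights were indeed loaded with torch_dtype=torch.bfloat16 (neither is shown above):

    import torch

    # Cast every LoRA adapter submodule to the base model's dtype so the
    # F.linear call inside peft's MergedLinear.forward sees a single dtype.
    for name, module in model.named_modules():
        if "lora_" in name:  # matches the lora_A / lora_B adapter layers
            module.to(torch.bfloat16)

Alternatively, the adapters can stay in float32 if generation runs under autocast, which inserts the per-op casts itself:

    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        generate_text("Look up the boiling point of water.")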