machine-learning, pytorch, huggingface-transformers, huggingface

Getting CUDA out of memory when loading microsoft/Orca-2-13b from Hugging Face


I am using Ubuntu 24.04.1 on an AWS EC2 instance g5.8xlarge.

I am receiving the following error message:

OutOfMemoryError: Allocation on device 

Code:

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
import torch
torch.cuda.empty_cache()
import transformers

if torch.cuda.is_available():
    torch.set_default_device("cuda")

device = torch.device("cuda")

model = transformers.AutoModelForCausalLM.from_pretrained("microsoft/Orca-2-13b", device_map=device)

Full error:

/home/ubuntu/anaconda3/envs/ai/lib/python3.12/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML
  warnings.warn("Can't initialize NVML")

Loading checkpoint shards:  33%  2/6 [00:04<00:06,  1.72s/it]

/home/ubuntu/anaconda3/envs/ai/lib/python3.12/site-packages/torch/cuda/__init__.py:734: UserWarning: Can't initialize NVML
  warnings.warn("Can't initialize NVML")

---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
Cell In[5], line 6
      2     torch.set_default_device("cuda")
      4 device = torch.device("cuda")
----> 6 model = transformers.AutoModelForCausalLM.from_pretrained("microsoft/Orca-2-13b", device_map=device)
      8 # https://github.com/huggingface/transformers/issues/27132
      9 # please use the slow tokenizer since fast and slow tokenizer produces different tokens
     10 tokenizer = transformers.AutoTokenizer.from_pretrained(
     11         "microsoft/Orca-2-13b",
     12         use_fast=True,
     13     )

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py:564, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    562 elif type(config) in cls._model_mapping.keys():
    563     model_class = _get_model_class(config, cls._model_mapping)
--> 564     return model_class.from_pretrained(
    565         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    566     )
    567 raise ValueError(
    568     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    569     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
    570 )

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/transformers/modeling_utils.py:262, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
    260 old_dtype = torch.get_default_dtype()
    261 try:
--> 262     return func(*args, **kwargs)
    263 finally:
    264     torch.set_default_dtype(old_dtype)

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/transformers/modeling_utils.py:4319, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
   4309     if dtype_orig is not None:
   4310         torch.set_default_dtype(dtype_orig)
   4312     (
   4313         model,
   4314         missing_keys,
   4315         unexpected_keys,
   4316         mismatched_keys,
   4317         offload_index,
   4318         error_msgs,
-> 4319     ) = cls._load_pretrained_model(
   4320         model,
   4321         state_dict,
   4322         loaded_state_dict_keys,  # XXX: rename?
   4323         resolved_archive_file,
   4324         pretrained_model_name_or_path,
   4325         ignore_mismatched_sizes=ignore_mismatched_sizes,
   4326         sharded_metadata=sharded_metadata,
   4327         _fast_init=_fast_init,
   4328         low_cpu_mem_usage=low_cpu_mem_usage,
   4329         device_map=device_map,
   4330         offload_folder=offload_folder,
   4331         offload_state_dict=offload_state_dict,
   4332         dtype=torch_dtype,
   4333         hf_quantizer=hf_quantizer,
   4334         keep_in_fp32_modules=keep_in_fp32_modules,
   4335         gguf_path=gguf_path,
   4336         weights_only=weights_only,
   4337     )
   4339 # make sure token embedding weights are still tied if needed
   4340 model.tie_weights()

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/transformers/modeling_utils.py:4897, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, hf_quantizer, keep_in_fp32_modules, gguf_path, weights_only)
   4895     else:
   4896         fixed_state_dict = cls._fix_state_dict_keys_on_load(state_dict)
-> 4897         new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
   4898             model_to_load,
   4899             fixed_state_dict,
   4900             start_prefix,
   4901             expected_keys,
   4902             device_map=device_map,
   4903             offload_folder=offload_folder,
   4904             offload_index=offload_index,
   4905             state_dict_folder=state_dict_folder,
   4906             state_dict_index=state_dict_index,
   4907             dtype=dtype,
   4908             hf_quantizer=hf_quantizer,
   4909             is_safetensors=is_safetensors,
   4910             keep_in_fp32_modules=keep_in_fp32_modules,
   4911             unexpected_keys=unexpected_keys,
   4912         )
   4913         error_msgs += new_error_msgs
   4914 else:
   4915     # Sharded checkpoint or whole but low_cpu_mem_usage==True

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/transformers/modeling_utils.py:896, in _load_state_dict_into_meta_model(model, state_dict, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, hf_quantizer, is_safetensors, keep_in_fp32_modules, unexpected_keys, pretrained_model_name_or_path)
    893         param_device = "cpu" if is_local_dist_rank_0() else "meta"
    895     # For backward compatibility with older versions of `accelerate` and for non-quantized params
--> 896     set_module_tensor_to_device(model, param_name, param_device, **set_module_kwargs)
    897 else:
    898     hf_quantizer.create_quantized_param(model, param, param_name, param_device, state_dict, unexpected_keys)

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/accelerate/utils/modeling.py:330, in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics, tied_params_map)
    328             module._parameters[tensor_name] = param_cls(new_value, requires_grad=old_value.requires_grad)
    329 elif isinstance(value, torch.Tensor):
--> 330     new_value = value.to(device)
    331 else:
    332     new_value = torch.tensor(value, device=device)

File ~/anaconda3/envs/ai/lib/python3.12/site-packages/torch/utils/_device.py:104, in DeviceContext.__torch_function__(self, func, types, args, kwargs)
    102 if func in _device_constructors() and kwargs.get('device') is None:
    103     kwargs['device'] = self.device
--> 104 return func(*args, **kwargs)

OutOfMemoryError: Allocation on device 

Solution

  • You can check out information on the specific model here. It requires 52.1 GB of VRAM (GPU memory), which is about what you would expect for a 13-billion-parameter model stored in float32 (13B parameters × 4 bytes ≈ 52 GB).

    Based on this table, a g5.8xlarge has a single NVIDIA A10G with 24 GB of GPU memory, so the full-precision model won't fit. If you aren't able to get more GPU memory, look into quantized models; a sketch of loading a 4-bit quantized version is shown below.

    You can check out which models on Hugging Face have quantized versions, how much GPU memory they require, and their best use cases.