Description
Hi there,
This is a brilliant idea, but it doesn't seem to work on my 16 GB RTX 3080: building the model fails with a CUDA out-of-memory error while the expert cache allocates its pinned offload storages.
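For context, here is roughly how I set up offload_config before the failing call shown just below. It follows the repo's demo notebook as far as I remember, so the exact values and field names are my reconstruction rather than a verbatim copy of what I ran:

from src.build_model import OffloadConfig

num_layers = 32         # num_hidden_layers from the MixtralConfig below
num_experts = 8         # num_local_experts from the MixtralConfig below
offload_per_layer = 4   # experts per layer kept in offloaded (pinned CPU) storage

offload_config = OffloadConfig(
    main_size=num_layers * (num_experts - offload_per_layer),  # expert slots resident on the GPU
    offload_size=num_layers * offload_per_layer,               # expert slots in pinned host memory
    buffer_size=4,                                              # temporary buffers used for prefetching
    offload_per_layer=offload_per_layer,                        # if I remember this field correctly
)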
model = build_model(
device=device,
quant_config=quant_config,
offload_config=offload_config,
state_path=state_path,
)
/home/cc/.local/lib/python3.11/site-packages/torch/nn/init.py:452: UserWarning: Initializing zero-element tensors is a no-op
warnings.warn("Initializing zero-element tensors is a no-op")
RuntimeError Traceback (most recent call last)
Cell In[6], line 1
----> 1 model = build_model(
2 device=device,
3 quant_config=quant_config,
4 offload_config=offload_config,
5 state_path=state_path,
6 )
File /mnt/d/MyPeojects/18-mixtral-offloading/src/build_model.py:204, in build_model(device, quant_config, offload_config, state_path)
198 trunk_state_path = os.path.join(
199 state_path,
200 weight_map["model.embed_tokens.weight"],
201 )
202 model.load_state_dict(load_file(trunk_state_path, device=str(device)), strict=True)
--> 204 expert_cache = ExpertCache(
205 make_module=_make_module,
206 main_size=offload_config.main_size,
207 offload_size=offload_config.offload_size,
208 buffer_size=offload_config.buffer_size,
209 )
210 for layer_idx in trange(model_config.num_hidden_layers, desc="Loading experts"):
211 curr_layer = model.model.layers[layer_idx]
File /mnt/d/MyPeojects/18-mixtral-offloading/src/expert_cache.py:67, in ExpertCache.__init__(self, make_module, main_size, offload_size, buffer_size)
64 self.main_infos: List[Optional[ExpertInfo]] = [None for _ in range(main_size)]
66 assert self.module_size is not None
---> 67 self.offloaded_storages = [
68 torch.UntypedStorage(self.module_size).pin_memory(self.device) for _ in range(offload_size)]
69 self.offloaded_infos: List[Optional[ExpertInfo]] = [None for _ in range(offload_size)]
71 # temporary storage to shave off latency
File /mnt/d/MyPeojects/18-mixtral-offloading/src/expert_cache.py:68, in <listcomp>(.0)
64 self.main_infos: List[Optional[ExpertInfo]] = [None for _ in range(main_size)]
66 assert self.module_size is not None
67 self.offloaded_storages = [
---> 68 torch.UntypedStorage(self.module_size).pin_memory(self.device) for _ in range(offload_size)]
69 self.offloaded_infos: List[Optional[ExpertInfo]] = [None for _ in range(offload_size)]
71 # temporary storage to shave off latency
File ~/.local/lib/python3.11/site-packages/torch/storage.py:235, in StorageBase.pin_memory(self, device)
231 if self.device.type != 'cpu':
232 raise TypeError(f"cannot pin '{self.type()}' only CPU memory can be pinned")
234 pinned_tensor = torch.tensor([], dtype=torch.uint8, device=self.device).set_(
--> 235 cast(Storage, self)).pin_memory(device)
236 return pinned_tensor.untyped_storage()
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
MixtralConfig {
"_name_or_path": "lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo",
"architectures": [
"MixtralForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 32768,
"model_type": "mixtral",
"num_attention_heads": 32,
"num_experts_per_tok": 2,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"num_local_experts": 8,
"output_router_logits": false,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000.0,
"router_aux_loss_coef": 0.02,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.36.1",
"use_cache": true,
"vocab_size": 32000
}
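In case it helps with diagnosis, I can run a quick check of what the driver reports right before the failing call and post the numbers; something like this (plain torch.cuda calls, nothing specific to this repo):

import torch

# Report free/total device memory as seen by the CUDA driver,
# plus what the PyTorch caching allocator has already reserved.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"driver free: {free_bytes / 2**30:.2f} GiB / total: {total_bytes / 2**30:.2f} GiB")
print(f"allocator reserved: {torch.cuda.memory_reserved() / 2**30:.2f} GiB")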
Can you provide me with some advice on this matter?
Thanks a bunch for any help you can offer! Looking forward to hearing back from you soon.