Commit 1de83f9
Fix OOM regression in _apply() for quantized models during inference (#13372)
Skip unnecessary clone of inference-mode tensors when already inside
torch.inference_mode(), matching the existing guard in set_attr_param.
The unconditional clone introduced in 20561aa caused transient VRAM
doubling during model movement for FP8/quantized models.

1 parent 8f37471
1 file changed: 1 addition, 1 deletion

[Diff content not captured: context lines 1151–1157 shown, with the single change at line 1154 (one line removed, one line added). A hedged sketch of the guard follows.]
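Since the patched line itself was not captured, here is a minimal sketch of the guard pattern the commit message describes. The helper name `_move_param` and its surrounding structure are assumptions for illustration; only the inference-mode check mirrors the description (and the existing guard in set_attr_param it is said to match):

```python
import torch

def _move_param(tensor: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper sketching the fix, not the actual patched code.
    # Inside torch.inference_mode(), an inference tensor can be returned
    # as-is: cloning it would transiently hold two full copies of the
    # weight in VRAM, the doubling the commit message describes for
    # FP8/quantized models.
    if torch.is_inference_mode_enabled() and tensor.is_inference():
        return tensor
    # Otherwise keep the clone that 20561aa introduced.
    return tensor.clone()
```

Under this guard, moving a model's weights inside torch.inference_mode() peaks at one copy of each tensor instead of two.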