-
Notifications
You must be signed in to change notification settings - Fork 161
Open
Description
> [rank1]: File "/mnt/nj-dev-data-image-text2image-sdb/project/x-flux/train_flux_deepspeed.py", line 303, in <module>
> [rank1]: main()
> [rank1]: File "/mnt/nj-dev-data-image-text2image-sdb/project/x-flux/train_flux_deepspeed.py", line 232, in main
> [rank1]: model_pred = dit(img=x_t.to(weight_dtype),
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
> [rank1]: return self._call_impl(*args, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
> [rank1]: return forward_call(*args, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
> [rank1]: ret_val = func(*args, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1987, in forward
> [rank1]: loss = self.module(*inputs, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
> [rank1]: return self._call_impl(*args, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
> [rank1]: return forward_call(*args, **kwargs)
> [rank1]: File "/mnt/nj-dev-data-image-text2image-sdb/project/x-flux/src/flux/model.py", line 190, in forward
> [rank1]: img, txt = block(
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
> [rank1]: return self._call_impl(*args, **kwargs)
> [rank1]: File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
> [rank1]: return forward_call(*args, **kwargs)
> [rank1]: File "/mnt/nj-dev-data-image-text2image-sdb/project/x-flux/src/flux/modules/layers.py", line 387, in forward
> [rank1]: return self.processor(self, img, txt, vec, pe)
> [rank1]: File "/mnt/nj-dev-data-image-text2image-sdb/project/x-flux/src/flux/modules/layers.py", line 310, in __call__
> [rank1]: img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
> [rank1]: RuntimeError: The size of tensor a (77) must match the size of tensor b (3072) at non-singleton dimension 2
> W0514 19:57:40.544000 140052167190336 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 1493136 closing signal SIGTERM
> E0514 19:57:43.265000 140052167190336 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 1 (pid: 1493137) of binary: /root/mambaforge/envs/gwb_kohya/bin/python3.10
> Traceback (most recent call last):
> File "/root/mambaforge/envs/gwb_kohya/bin/accelerate", line 8, in <module>
> sys.exit(main())
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
> args.func(args)
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1153, in launch_command
> deepspeed_launcher(args)
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/accelerate/commands/launch.py", line 846, in deepspeed_launcher
> distrib_run.run(args)
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
> elastic_launch(
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
> return launch_agent(self._config, self._entrypoint, list(args))
> File "/root/mambaforge/envs/gwb_kohya/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
> raise ChildFailedError(
> torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
> ============================================================
> train_flux_deepspeed.py FAILED
> ------------------------------------------------------------
> Failures:
> <NO_OTHER_FAILURES>
> ------------------------------------------------------------
> Root Cause (first observed failure):
> [0]:
> time : 2025-05-14_19:57:40
> host : afdbe3e33394
> rank : 1 (local_rank: 1)
> exitcode : 1 (pid: 1493137)
> error_file: <N/A>
> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Metadata
Assignees
Labels
No labels