-
Notifications
You must be signed in to change notification settings - Fork 577
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 请问你使用的是哪个版本的torch?我用的2.1.0 2.3.1 2.4的都报这个错 #194
Description
Traceback (most recent call last):
File "/home/caslx/qwen/conda/envs/liudongze/bin/lightning", line 8, in <module>
sys.exit(_cli_entry_point())
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/__init__.py", line 47, in _cli_entry_point
main()
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/app/cli/lightning_cli.py", line 109, in main
_main()
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/app/utilities/exceptions.py", line 37, in invoke
return super().invoke(ctx)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/fabric/cli.py", line 129, in _run_model
main(args=Namespace(**kwargs), script_args=script_args)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/fabric/cli.py", line 187, in main
_torchrun_launch(args, script_args or [])
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/lightning/fabric/cli.py", line 182, in _torchrun_launch
torchrun.main(torchrun_args)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/caslx/qwen/conda/envs/liudongze/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: