Skip to content

Commit 078822d

Browse files
committed
Fix data path in multi gpu notebook
1 parent 1be43a3 commit 078822d

File tree

2 files changed

+51
-54
lines changed

2 files changed

+51
-54
lines changed

examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb

Lines changed: 50 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@
7878
"name": "stderr",
7979
"output_type": "stream",
8080
"text": [
81-
"downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n",
82-
"unzipping files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.60files/s]\n"
81+
"downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n",
82+
"unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n"
8383
]
8484
}
8585
],
@@ -155,6 +155,7 @@
155155
"source": [
156156
"%%writefile \"./tf_trainer.py\"\n",
157157
"\n",
158+
"import argparse\n",
158159
"import os\n",
159160
"\n",
160161
"# the order of statements and imports is important\n",
@@ -180,15 +181,20 @@
180181
"\n",
181182
"from merlin.loader.tensorflow import Loader\n",
182183
"\n",
184+
"parser = argparse.ArgumentParser()\n",
185+
"parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n",
186+
"parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n",
187+
"args = parser.parse_args()\n",
183188
"\n",
184-
"DATA_PATH = os.getenv(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n",
189+
"DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n",
190+
"BATCH_SIZE = args.batch_size or 1024\n",
185191
"\n",
186192
"dataset = Dataset(os.path.join(DATA_PATH, \"train.parquet\"))\n",
187193
"dataset = dataset.repartition(MPI_SIZE)\n",
188194
"\n",
189195
"loader = Loader(\n",
190196
" dataset,\n",
191-
" batch_size=64 * 1024,\n",
197+
" batch_size=BATCH_SIZE,\n",
192198
" global_size=MPI_SIZE,\n",
193199
" global_rank=MPI_RANK,\n",
194200
" device=MPI_RANK,\n",
@@ -280,7 +286,7 @@
280286
},
281287
{
282288
"cell_type": "code",
283-
"execution_count": 6,
289+
"execution_count": null,
284290
"id": "ec5e9b7f",
285291
"metadata": {
286292
"scrolled": true
@@ -290,58 +296,49 @@
290296
"name": "stdout",
291297
"output_type": "stream",
292298
"text": [
293-
"2023-06-03 21:35:18.892140: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
294-
"2023-06-03 21:35:18.932879: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
295-
"To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
296-
"[1,1]<stderr>:2023-06-03 21:35:23.549563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
297-
"[1,0]<stderr>:2023-06-03 21:35:23.568539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
298-
"[1,1]<stderr>:2023-06-03 21:35:23.592349: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
299-
"[1,1]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
300-
"[1,0]<stderr>:2023-06-03 21:35:23.609861: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
301-
"[1,0]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
302-
"[1,1]<stderr>:2023-06-03 21:35:28.092241: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n",
303-
"[1,1]<stderr>:2023-06-03 21:35:28.092336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24337 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n",
304-
"[1,0]<stderr>:2023-06-03 21:35:28.141988: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n",
305-
"[1,0]<stderr>:2023-06-03 21:35:28.142076: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24338 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n",
306-
"[1,0]<stderr>:2023-06-03 21:35:32.089463: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f1150020480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
307-
"[1,0]<stderr>:2023-06-03 21:35:32.089532: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n",
308-
"[1,1]<stderr>:2023-06-03 21:35:32.089552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x79f8590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
309-
"[1,1]<stderr>:2023-06-03 21:35:32.089613: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n",
310-
"[1,1]<stderr>:2023-06-03 21:35:32.101885: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
311-
"[1,0]<stderr>:2023-06-03 21:35:32.102268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
312-
"[1,1]<stderr>:2023-06-03 21:35:33.637854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n",
313-
"[1,0]<stderr>:2023-06-03 21:35:33.648275: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n",
314-
"[1,1]<stderr>:2023-06-03 21:35:33.834015: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n",
315-
"[1,0]<stderr>:2023-06-03 21:35:33.854743: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n",
316-
"[1,0]<stdout>:Step #0\tLoss: 13.976147\n",
317-
"[1,0]<stdout>:Step #10\tLoss: 13.746956\n",
318-
"[1,0]<stdout>:Step #20\tLoss: 13.907515\n",
319-
"[1,0]<stdout>:Step #30\tLoss: 14.084653\n",
320-
"[1,0]<stdout>:Step #40\tLoss: 13.346972\n",
321-
"[1,0]<stdout>:Step #50\tLoss: 13.931261\n",
322-
"[1,0]<stdout>:Step #60\tLoss: 13.707795\n",
323-
"[1,0]<stdout>:Step #70\tLoss: 13.510033\n",
324-
"[1,0]<stdout>:Step #80\tLoss: 13.372274\n",
325-
"[1,0]<stdout>:Step #90\tLoss: 13.713926\n",
326-
"[1,0]<stdout>:Step #100\tLoss: 13.236437\n",
327-
"[1,0]<stdout>:Step #110\tLoss: 13.265822\n",
328-
"[1,0]<stdout>:Step #120\tLoss: 13.991277\n",
329-
"[1,0]<stdout>:Step #130\tLoss: 14.069466\n",
330-
"[1,0]<stdout>:Step #140\tLoss: 13.635876\n",
331-
"[1,0]<stdout>:Step #150\tLoss: 13.416016\n",
332-
"[1,0]<stdout>:Step #160\tLoss: 13.216636\n",
333-
"[1,0]<stdout>:Step #170\tLoss: 12.776440\n",
334-
"[1,0]<stdout>:Step #180\tLoss: 13.570569\n",
335-
"[1,0]<stdout>:Step #190\tLoss: 13.868576\n",
336-
"[1,1]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n",
337-
"[1,1]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n",
338-
"[1,0]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n",
339-
"[1,0]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n"
299+
"2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
300+
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
301+
"2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
302+
"[1,0]<stderr>:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
303+
"[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
304+
"[1,1]<stderr>:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
305+
"[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
306+
"[1,1]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
307+
"[1,0]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
308+
"[1,1]<stderr>:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
309+
"[1,1]<stderr>:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n",
310+
"[1,0]<stderr>:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
311+
"[1,0]<stderr>:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n",
312+
"[1,1]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
313+
"[1,1]<stderr>:Instructions for updating:\n",
314+
"[1,1]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
315+
"[1,0]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
316+
"[1,0]<stderr>:Instructions for updating:\n",
317+
"[1,0]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
318+
"[1,0]<stdout>:Step #0\tLoss: 13.976286\n",
319+
"[1,0]<stdout>:Step #10\tLoss: 13.746111\n",
320+
"[1,0]<stdout>:Step #20\tLoss: 13.905323\n",
321+
"[1,0]<stdout>:Step #30\tLoss: 14.093473\n",
322+
"[1,0]<stdout>:Step #40\tLoss: 13.336206\n",
323+
"[1,0]<stdout>:Step #50\tLoss: 13.932583\n",
324+
"[1,0]<stdout>:Step #60\tLoss: 13.702780\n",
325+
"[1,0]<stdout>:Step #70\tLoss: 13.522057\n",
326+
"[1,0]<stdout>:Step #80\tLoss: 13.382860\n",
327+
"[1,0]<stdout>:Step #90\tLoss: 13.701270\n",
328+
"[1,0]<stdout>:Step #100\tLoss: 13.240610\n",
329+
"[1,0]<stdout>:Step #110\tLoss: 13.264977\n",
330+
"[1,0]<stdout>:Step #120\tLoss: 13.984927\n",
331+
"[1,0]<stdout>:Step #130\tLoss: 14.039978\n",
332+
"[1,0]<stdout>:Step #140\tLoss: 13.639907\n",
333+
"[1,0]<stdout>:Step #150\tLoss: 13.430090\n",
334+
"[1,0]<stdout>:Step #160\tLoss: 13.219415\n",
335+
"[1,0]<stdout>:Step #170\tLoss: 12.758451\n",
336+
"[1,0]<stdout>:Step #180\tLoss: 13.592442\n"
340337
]
341338
}
342339
],
343340
"source": [
344-
"!horovodrun -np {GPU_COUNT} python tf_trainer.py"
341+
"!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536"
345342
]
346343
},
347344
{

tests/examples/test_multi_GPU_with_horovod_and_tensorflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,6 @@ def test_getting_started_tensorflow(tb, tmpdir):
5656
process.wait()
5757
stdout, stderr = process.communicate()
5858
print(stdout, stderr)
59-
assert "Loss" in str(stdout)
59+
assert "Loss:" in str(stdout)
6060

6161
assert any(f.startswith("checkpoints-") for f in os.listdir(os.getcwd()))

0 commit comments

Comments
 (0)