|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "bb28e271", |
7 | 7 | "metadata": {}, |
8 | 8 | "outputs": [], |
|
57 | 57 | }, |
58 | 58 | { |
59 | 59 | "cell_type": "code", |
60 | | - "execution_count": 2, |
| 60 | + "execution_count": null, |
61 | 61 | "id": "edd46306", |
62 | 62 | "metadata": {}, |
63 | 63 | "outputs": [], |
|
70 | 70 | }, |
71 | 71 | { |
72 | 72 | "cell_type": "code", |
73 | | - "execution_count": 3, |
| 73 | + "execution_count": null, |
74 | 74 | "id": "591f8c61", |
75 | 75 | "metadata": {}, |
76 | | - "outputs": [ |
77 | | - { |
78 | | - "name": "stderr", |
79 | | - "output_type": "stream", |
80 | | - "text": [ |
81 | | - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", |
82 | | - "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n" |
83 | | - ] |
84 | | - } |
85 | | - ], |
| 76 | + "outputs": [], |
86 | 77 | "source": [ |
87 | 78 | "DATA_PATH = os.environ.get(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", |
88 | 79 | "download_file(\"http://files.grouplens.org/datasets/movielens/ml-25m.zip\", DATA_PATH + \"/ml-25m.zip\")" |
|
110 | 101 | }, |
111 | 102 | { |
112 | 103 | "cell_type": "code", |
113 | | - "execution_count": 4, |
| 104 | + "execution_count": null, |
114 | 105 | "id": "c65e5ef6", |
115 | 106 | "metadata": {}, |
116 | 107 | "outputs": [], |
|
140 | 131 | }, |
141 | 132 | { |
142 | 133 | "cell_type": "code", |
143 | | - "execution_count": 5, |
| 134 | + "execution_count": null, |
144 | 135 | "id": "9fbe17a7", |
145 | 136 | "metadata": {}, |
146 | | - "outputs": [ |
147 | | - { |
148 | | - "name": "stdout", |
149 | | - "output_type": "stream", |
150 | | - "text": [ |
151 | | - "Overwriting ./tf_trainer.py\n" |
152 | | - ] |
153 | | - } |
154 | | - ], |
| 137 | + "outputs": [], |
155 | 138 | "source": [ |
156 | 139 | "%%writefile \"./tf_trainer.py\"\n", |
157 | 140 | "\n", |
|
183 | 166 | "\n", |
184 | 167 | "parser = argparse.ArgumentParser()\n", |
185 | 168 | "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n", |
186 | | - "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n", |
| 169 | + "parser.add_argument(\"--batch_size\", type=int, default=None, help=\"Batch size.\")\n", |
187 | 170 | "args = parser.parse_args()\n", |
188 | 171 | "\n", |
189 | 172 | "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n", |
|
291 | 274 | "metadata": { |
292 | 275 | "scrolled": true |
293 | 276 | }, |
294 | | - "outputs": [ |
295 | | - { |
296 | | - "name": "stdout", |
297 | | - "output_type": "stream", |
298 | | - "text": [ |
299 | | - "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
300 | | - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
301 | | - "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
302 | | - "[1,0]<stderr>:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
303 | | - "[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
304 | | - "[1,1]<stderr>:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
305 | | - "[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
306 | | - "[1,1]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
307 | | - "[1,0]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
308 | | - "[1,1]<stderr>:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
309 | | - "[1,1]<stderr>:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", |
310 | | - "[1,0]<stderr>:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
311 | | - "[1,0]<stderr>:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", |
312 | | - "[1,1]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
313 | | - "[1,1]<stderr>:Instructions for updating:\n", |
314 | | - "[1,1]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
315 | | - "[1,0]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
316 | | - "[1,0]<stderr>:Instructions for updating:\n", |
317 | | - "[1,0]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
318 | | - "[1,0]<stdout>:Step #0\tLoss: 13.976286\n", |
319 | | - "[1,0]<stdout>:Step #10\tLoss: 13.746111\n", |
320 | | - "[1,0]<stdout>:Step #20\tLoss: 13.905323\n", |
321 | | - "[1,0]<stdout>:Step #30\tLoss: 14.093473\n", |
322 | | - "[1,0]<stdout>:Step #40\tLoss: 13.336206\n", |
323 | | - "[1,0]<stdout>:Step #50\tLoss: 13.932583\n", |
324 | | - "[1,0]<stdout>:Step #60\tLoss: 13.702780\n", |
325 | | - "[1,0]<stdout>:Step #70\tLoss: 13.522057\n", |
326 | | - "[1,0]<stdout>:Step #80\tLoss: 13.382860\n", |
327 | | - "[1,0]<stdout>:Step #90\tLoss: 13.701270\n", |
328 | | - "[1,0]<stdout>:Step #100\tLoss: 13.240610\n", |
329 | | - "[1,0]<stdout>:Step #110\tLoss: 13.264977\n", |
330 | | - "[1,0]<stdout>:Step #120\tLoss: 13.984927\n", |
331 | | - "[1,0]<stdout>:Step #130\tLoss: 14.039978\n", |
332 | | - "[1,0]<stdout>:Step #140\tLoss: 13.639907\n", |
333 | | - "[1,0]<stdout>:Step #150\tLoss: 13.430090\n", |
334 | | - "[1,0]<stdout>:Step #160\tLoss: 13.219415\n", |
335 | | - "[1,0]<stdout>:Step #170\tLoss: 12.758451\n", |
336 | | - "[1,0]<stdout>:Step #180\tLoss: 13.592442\n" |
337 | | - ] |
338 | | - } |
339 | | - ], |
| 277 | + "outputs": [], |
340 | 278 | "source": [ |
341 | | - "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" |
| 279 | + "! horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" |
342 | 280 | ] |
343 | 281 | }, |
344 | 282 | { |
|
0 commit comments