|
8 | 8 | "# Using Sherlock out-of-the-box\n", |
9 | 9 | "This notebook shows how to predict a semantic type for a given table column.\n", |
10 | 10 | "The main steps are:\n", |
11 | | - "- Extract features from a column.\n", |
| 11 | + "- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.\n", |
| 12 | + "- Extract features from table columns.\n", |
12 | 13 | "- Initialize Sherlock.\n", |
13 | 14 | "- Make a prediction for the feature representation of each column." |
14 | 15 | ] |
|
44 | 45 | "metadata": {}, |
45 | 46 | "outputs": [ |
46 | 47 | { |
47 | | - "name": "stderr", |
48 | | - "output_type": "stream", |
49 | | - "text": [ |
50 | | - "UsageError: Environment does not have key: PYTHONHASHSEED\n" |
51 | | - ] |
| 48 | + "data": { |
| 49 | + "text/plain": [ |
| 50 | + "'13'" |
| 51 | + ] |
| 52 | + }, |
| 53 | + "execution_count": 2, |
| 54 | + "metadata": {}, |
| 55 | + "output_type": "execute_result" |
52 | 56 | } |
53 | 57 | ], |
54 | 58 | "source": [ |
|
57 | 61 | }, |
58 | 62 | { |
59 | 63 | "cell_type": "markdown", |
60 | | - "id": "2b3b7967", |
| 64 | + "id": "f1101303", |
61 | 65 | "metadata": {}, |
62 | 66 | "source": [ |
63 | | - "## Extract features" |
64 | | - ] |
65 | | - }, |
66 | | - { |
67 | | - "cell_type": "code", |
68 | | - "execution_count": 8, |
69 | | - "id": "164f74ff", |
70 | | - "metadata": {}, |
71 | | - "outputs": [], |
72 | | - "source": [ |
73 | | - "# helpers.download_data()" |
| 67 | + "## Initialize feature extraction models" |
74 | 68 | ] |
75 | 69 | }, |
76 | 70 | { |
|
93 | 87 | " \n", |
94 | 88 | "All files for extracting word and paragraph embeddings are present.\n", |
95 | 89 | "Initialising word embeddings\n", |
96 | | - "Initialise Word Embeddings process took 0:00:05.607905 seconds.\n", |
97 | | - "Initialise Doc2Vec Model, 400 dim, process took 0:00:02.443327 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n", |
98 | | - "Initialised NLTK, process took 0:00:00.181374 seconds.\n" |
| 90 | + "Initialise Word Embeddings process took 0:00:05.513540 seconds.\n", |
| 91 | + "Initialise Doc2Vec Model, 400 dim, process took 0:00:04.191875 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n", |
| 92 | + "Initialised NLTK, process took 0:00:00.209930 seconds.\n" |
99 | 93 | ] |
100 | 94 | }, |
101 | 95 | { |
|
117 | 111 | "initialise_nltk()" |
118 | 112 | ] |
119 | 113 | }, |
| 114 | + { |
| 115 | + "cell_type": "markdown", |
| 116 | + "id": "2b3b7967", |
| 117 | + "metadata": {}, |
| 118 | + "source": [ |
| 119 | + "## Extract features" |
| 120 | + ] |
| 121 | + }, |
120 | 122 | { |
121 | 123 | "cell_type": "code", |
122 | | - "execution_count": 35, |
| 124 | + "execution_count": 4, |
123 | 125 | "id": "db04ccf9", |
124 | 126 | "metadata": {}, |
125 | 127 | "outputs": [], |
|
128 | 130 | " [\n", |
129 | 131 | " [\"Jane Smith\", \"Lute Ahorn\", \"Anna James\"],\n", |
130 | 132 | " [\"Amsterdam\", \"Haarlem\", \"Zwolle\"],\n", |
| 133 | + " [\"Chabot Street 19\", \"1200 fifth Avenue\", \"Binnenkant 22, 1011BH\"]\n", |
131 | 134 | " ],\n", |
132 | 135 | " name=\"values\"\n", |
133 | 136 | ")" |
134 | 137 | ] |
135 | 138 | }, |
136 | 139 | { |
137 | 140 | "cell_type": "code", |
138 | | - "execution_count": 36, |
| 141 | + "execution_count": 5, |
139 | 142 | "id": "4875f6c7", |
140 | 143 | "metadata": {}, |
141 | 144 | "outputs": [ |
142 | 145 | { |
143 | 146 | "data": { |
144 | 147 | "text/plain": [ |
145 | | - "0 [Jane Smith, Lute Ahorn, Anna James]\n", |
146 | | - "1 [Amsterdam, Haarlem, Zwolle]\n", |
| 148 | + "0 [Jane Smith, Lute Ahorn, Anna James]\n", |
| 149 | + "1 [Amsterdam, Haarlem, Zwolle]\n", |
| 150 | + "2 [Chabot Street 19, 1200 fifth Avenue, Binnenka...\n", |
147 | 151 | "Name: values, dtype: object" |
148 | 152 | ] |
149 | 153 | }, |
150 | | - "execution_count": 36, |
| 154 | + "execution_count": 5, |
151 | 155 | "metadata": {}, |
152 | 156 | "output_type": "execute_result" |
153 | 157 | } |
|
158 | 162 | }, |
159 | 163 | { |
160 | 164 | "cell_type": "code", |
161 | | - "execution_count": 37, |
| 165 | + "execution_count": 8, |
162 | 166 | "id": "f7f2c846", |
163 | 167 | "metadata": {}, |
164 | 168 | "outputs": [ |
165 | 169 | { |
166 | 170 | "name": "stderr", |
167 | 171 | "output_type": "stream", |
168 | 172 | "text": [ |
169 | | - "Extracting Features: 100%|██████████| 2/2 [00:00<00:00, 62.37it/s]\n" |
| 173 | + "Extracting Features: 100%|██████████| 3/3 [00:00<00:00, 167.51it/s]" |
170 | 174 | ] |
171 | 175 | }, |
172 | 176 | { |
|
175 | 179 | "text": [ |
176 | 180 | "Exporting 1588 column features\n" |
177 | 181 | ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "name": "stderr", |
| 185 | + "output_type": "stream", |
| 186 | + "text": [ |
| 187 | + "\n" |
| 188 | + ] |
178 | 189 | } |
179 | 190 | ], |
180 | 191 | "source": [ |
181 | 192 | "extract_features(\n", |
182 | 193 | " \"../temporary.csv\",\n", |
183 | 194 | " data\n", |
184 | 195 | ")\n", |
185 | | - "feature_vector = pd.read_csv(\"../temporary.csv\", dtype=np.float32)" |
| 196 | + "feature_vectors = pd.read_csv(\"../temporary.csv\", dtype=np.float32)" |
186 | 197 | ] |
187 | 198 | }, |
188 | 199 | { |
189 | 200 | "cell_type": "code", |
190 | | - "execution_count": 38, |
| 201 | + "execution_count": 9, |
191 | 202 | "id": "0c42ce71", |
192 | 203 | "metadata": {}, |
193 | 204 | "outputs": [ |
|
241 | 252 | " <td>0.0</td>\n", |
242 | 253 | " <td>0.0</td>\n", |
243 | 254 | " <td>0.0</td>\n", |
244 | | - " <td>0.0</td>\n", |
| 255 | + " <td>0.000000</td>\n", |
245 | 256 | " <td>0.0</td>\n", |
246 | 257 | " <td>0.0</td>\n", |
247 | 258 | " <td>0.0</td>\n", |
248 | 259 | " <td>0.0</td>\n", |
249 | 260 | " <td>-3.0</td>\n", |
250 | 261 | " <td>0.0</td>\n", |
251 | 262 | " <td>...</td>\n", |
252 | | - " <td>-0.115819</td>\n", |
253 | | - " <td>0.023961</td>\n", |
254 | | - " <td>-0.130739</td>\n", |
255 | | - " <td>0.006393</td>\n", |
256 | | - " <td>-0.135118</td>\n", |
257 | | - " <td>-0.071956</td>\n", |
258 | | - " <td>-0.051051</td>\n", |
259 | | - " <td>-0.068307</td>\n", |
260 | | - " <td>0.087342</td>\n", |
261 | | - " <td>-0.145716</td>\n", |
| 263 | + " <td>-0.116468</td>\n", |
| 264 | + " <td>0.023982</td>\n", |
| 265 | + " <td>-0.130867</td>\n", |
| 266 | + " <td>0.006825</td>\n", |
| 267 | + " <td>-0.135098</td>\n", |
| 268 | + " <td>-0.070616</td>\n", |
| 269 | + " <td>-0.052172</td>\n", |
| 270 | + " <td>-0.067250</td>\n", |
| 271 | + " <td>0.086256</td>\n", |
| 272 | + " <td>-0.144385</td>\n", |
262 | 273 | " </tr>\n", |
263 | 274 | " <tr>\n", |
264 | 275 | " <th>1</th>\n", |
265 | 276 | " <td>0.0</td>\n", |
266 | 277 | " <td>0.0</td>\n", |
267 | 278 | " <td>0.0</td>\n", |
268 | | - " <td>0.0</td>\n", |
| 279 | + " <td>0.000000</td>\n", |
269 | 280 | " <td>0.0</td>\n", |
270 | 281 | " <td>0.0</td>\n", |
271 | 282 | " <td>0.0</td>\n", |
272 | 283 | " <td>0.0</td>\n", |
273 | 284 | " <td>-3.0</td>\n", |
274 | 285 | " <td>0.0</td>\n", |
275 | 286 | " <td>...</td>\n", |
276 | | - " <td>-0.054351</td>\n", |
277 | | - " <td>0.023650</td>\n", |
278 | | - " <td>-0.165681</td>\n", |
279 | | - " <td>-0.016137</td>\n", |
280 | | - " <td>-0.059402</td>\n", |
281 | | - " <td>0.008454</td>\n", |
282 | | - " <td>-0.044624</td>\n", |
283 | | - " <td>0.025160</td>\n", |
284 | | - " <td>0.037831</td>\n", |
285 | | - " <td>-0.086235</td>\n", |
| 287 | + " <td>-0.054949</td>\n", |
| 288 | + " <td>0.024502</td>\n", |
| 289 | + " <td>-0.166001</td>\n", |
| 290 | + " <td>-0.014375</td>\n", |
| 291 | + " <td>-0.058199</td>\n", |
| 292 | + " <td>0.009978</td>\n", |
| 293 | + " <td>-0.046423</td>\n", |
| 294 | + " <td>0.025163</td>\n", |
| 295 | + " <td>0.036946</td>\n", |
| 296 | + " <td>-0.086611</td>\n", |
| 297 | + " </tr>\n", |
| 298 | + " <tr>\n", |
| 299 | + " <th>2</th>\n", |
| 300 | + " <td>1.0</td>\n", |
| 301 | + " <td>0.0</td>\n", |
| 302 | + " <td>1.0</td>\n", |
| 303 | + " <td>0.666667</td>\n", |
| 304 | + " <td>0.0</td>\n", |
| 305 | + " <td>2.0</td>\n", |
| 306 | + " <td>1.0</td>\n", |
| 307 | + " <td>3.0</td>\n", |
| 308 | + " <td>-1.5</td>\n", |
| 309 | + " <td>0.0</td>\n", |
| 310 | + " <td>...</td>\n", |
| 311 | + " <td>-0.022804</td>\n", |
| 312 | + " <td>0.001741</td>\n", |
| 313 | + " <td>0.047479</td>\n", |
| 314 | + " <td>0.118293</td>\n", |
| 315 | + " <td>-0.093435</td>\n", |
| 316 | + " <td>0.036759</td>\n", |
| 317 | + " <td>-0.004508</td>\n", |
| 318 | + " <td>-0.087898</td>\n", |
| 319 | + " <td>-0.117796</td>\n", |
| 320 | + " <td>-0.191386</td>\n", |
286 | 321 | " </tr>\n", |
287 | 322 | " </tbody>\n", |
288 | 323 | "</table>\n", |
289 | | - "<p>2 rows × 1588 columns</p>\n", |
| 324 | + "<p>3 rows × 1588 columns</p>\n", |
290 | 325 | "</div>" |
291 | 326 | ], |
292 | 327 | "text/plain": [ |
293 | 328 | " n_[0]-agg-any n_[0]-agg-all n_[0]-agg-mean n_[0]-agg-var n_[0]-agg-min \\\n", |
294 | | - "0 0.0 0.0 0.0 0.0 0.0 \n", |
295 | | - "1 0.0 0.0 0.0 0.0 0.0 \n", |
| 329 | + "0 0.0 0.0 0.0 0.000000 0.0 \n", |
| 330 | + "1 0.0 0.0 0.0 0.000000 0.0 \n", |
| 331 | + "2 1.0 0.0 1.0 0.666667 0.0 \n", |
296 | 332 | "\n", |
297 | 333 | " n_[0]-agg-max n_[0]-agg-median n_[0]-agg-sum n_[0]-agg-kurtosis \\\n", |
298 | 334 | "0 0.0 0.0 0.0 -3.0 \n", |
299 | 335 | "1 0.0 0.0 0.0 -3.0 \n", |
| 336 | + "2 2.0 1.0 3.0 -1.5 \n", |
300 | 337 | "\n", |
301 | 338 | " n_[0]-agg-skewness ... par_vec_390 par_vec_391 par_vec_392 \\\n", |
302 | | - "0 0.0 ... -0.115819 0.023961 -0.130739 \n", |
303 | | - "1 0.0 ... -0.054351 0.023650 -0.165681 \n", |
| 339 | + "0 0.0 ... -0.116468 0.023982 -0.130867 \n", |
| 340 | + "1 0.0 ... -0.054949 0.024502 -0.166001 \n", |
| 341 | + "2 0.0 ... -0.022804 0.001741 0.047479 \n", |
304 | 342 | "\n", |
305 | 343 | " par_vec_393 par_vec_394 par_vec_395 par_vec_396 par_vec_397 \\\n", |
306 | | - "0 0.006393 -0.135118 -0.071956 -0.051051 -0.068307 \n", |
307 | | - "1 -0.016137 -0.059402 0.008454 -0.044624 0.025160 \n", |
| 344 | + "0 0.006825 -0.135098 -0.070616 -0.052172 -0.067250 \n", |
| 345 | + "1 -0.014375 -0.058199 0.009978 -0.046423 0.025163 \n", |
| 346 | + "2 0.118293 -0.093435 0.036759 -0.004508 -0.087898 \n", |
308 | 347 | "\n", |
309 | 348 | " par_vec_398 par_vec_399 \n", |
310 | | - "0 0.087342 -0.145716 \n", |
311 | | - "1 0.037831 -0.086235 \n", |
| 349 | + "0 0.086256 -0.144385 \n", |
| 350 | + "1 0.036946 -0.086611 \n", |
| 351 | + "2 -0.117796 -0.191386 \n", |
312 | 352 | "\n", |
313 | | - "[2 rows x 1588 columns]" |
| 353 | + "[3 rows x 1588 columns]" |
314 | 354 | ] |
315 | 355 | }, |
316 | | - "execution_count": 38, |
| 356 | + "execution_count": 9, |
317 | 357 | "metadata": {}, |
318 | 358 | "output_type": "execute_result" |
319 | 359 | } |
320 | 360 | ], |
321 | 361 | "source": [ |
322 | | - "feature_vector" |
| 362 | + "feature_vectors" |
323 | 363 | ] |
324 | 364 | }, |
325 | | - { |
326 | | - "cell_type": "code", |
327 | | - "execution_count": null, |
328 | | - "id": "52047a6b", |
329 | | - "metadata": {}, |
330 | | - "outputs": [], |
331 | | - "source": [] |
332 | | - }, |
333 | 365 | { |
334 | 366 | "cell_type": "code", |
335 | 367 | "execution_count": null, |
|
343 | 375 | "id": "9027fa4a", |
344 | 376 | "metadata": {}, |
345 | 377 | "source": [ |
346 | | - "## Initialize Sherlock." |
| 378 | + "## Initialize Sherlock" |
347 | 379 | ] |
348 | 380 | }, |
349 | 381 | { |
350 | 382 | "cell_type": "code", |
351 | | - "execution_count": 39, |
| 383 | + "execution_count": 11, |
352 | 384 | "id": "b9ec13ec", |
353 | 385 | "metadata": {}, |
354 | 386 | "outputs": [], |
355 | 387 | "source": [ |
356 | 388 | "model = SherlockModel();\n", |
357 | | - "model.initialize_model_from_json(with_weights=True);" |
| 389 | + "model.initialize_model_from_json(with_weights=True, model_id=\"sherlock\");" |
358 | 390 | ] |
359 | 391 | }, |
360 | 392 | { |
|
375 | 407 | }, |
376 | 408 | { |
377 | 409 | "cell_type": "code", |
378 | | - "execution_count": 40, |
| 410 | + "execution_count": 12, |
379 | 411 | "id": "fc079fa9", |
380 | 412 | "metadata": {}, |
381 | 413 | "outputs": [], |
382 | 414 | "source": [ |
383 | | - "predicted_labels = model.predict(feature_vector, \"sherlock\")" |
| 415 | + "predicted_labels = model.predict(feature_vectors, \"sherlock\")" |
384 | 416 | ] |
385 | 417 | }, |
386 | 418 | { |
387 | 419 | "cell_type": "code", |
388 | | - "execution_count": 41, |
| 420 | + "execution_count": 13, |
389 | 421 | "id": "0feb9584", |
390 | 422 | "metadata": {}, |
391 | 423 | "outputs": [ |
392 | 424 | { |
393 | 425 | "data": { |
394 | 426 | "text/plain": [ |
395 | | - "array(['creator', 'city'], dtype=object)" |
| 427 | + "array(['person', 'city', 'address'], dtype=object)" |
396 | 428 | ] |
397 | 429 | }, |
398 | | - "execution_count": 41, |
| 430 | + "execution_count": 13, |
399 | 431 | "metadata": {}, |
400 | 432 | "output_type": "execute_result" |
401 | 433 | } |
|
0 commit comments