From cebaf805df4d6ac312eb0aefd188c7823b60e1e4 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Tue, 22 Feb 2022 12:13:59 -0500 Subject: [PATCH 1/2] feat(dedupe): Check Geonames<->WOF concordances These concordances can be trusted over any other signals and really help us remove lots of bad Geonames data. --- helper/diffPlaces.js | 33 +++++++++++++++++++++++++++++++++ test/unit/helper/diffPlaces.js | 25 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 37516cc14..abb5682fc 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -4,6 +4,7 @@ const unicode = require('./unicode'); const placeTypes = require('./placeTypes'); const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers(); const field = require('../helper/fieldValue'); +const codec = require('pelias-model').codec; // only consider these layers as synonymous for deduplication purposes. // when performing inter-layer deduping, layers coming earlier in this list take @@ -186,11 +187,43 @@ function isAddressDifferent(item1, item2){ return false; } +function isGeonamesConcordanceSame(item1, item2) { + let wof_record; + let gn_record; + + if (item1.source === 'geonames' && item2.source === 'whosonfirst') { + gn_record = item1; + wof_record = item2; + } else if (item2.source === 'geonames' && item1.source === 'whosonfirst') { + gn_record = item2; + wof_record = item1; + } else { + // could not match to one geonames and one wof concordance, so this check does not apply + return false; + } + + const concordances = _.get(wof_record, 'addendum.concordances'); + + if (concordances) { + const json = codec.decode(concordances); + const concordance_id = json['gn:id']; + + if (concordance_id && typeof concordance_id === 'number' && concordance_id.toString() === gn_record.source_id) { + return true; + } + } + + return false; +} + /** * Compare the two records and return true if they differ and false if same. * Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication. */ function isDifferent(item1, item2, requestLanguage){ + // records that share a geonames concordance are the same, regardless of any other checks + if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; } + if( isLayerDifferent( item1, item2 ) ){ return true; } if( isParentHierarchyDifferent( item1, item2 ) ){ return true; } if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; } diff --git a/test/unit/helper/diffPlaces.js b/test/unit/helper/diffPlaces.js index d9d3d9fcb..5910219bb 100644 --- a/test/unit/helper/diffPlaces.js +++ b/test/unit/helper/diffPlaces.js @@ -765,6 +765,31 @@ module.exports.tests.layerDependentNormalization = function (test, common) { }); }; +module.exports.tests.geonames = function (test, common) { + test('geonames record with concordance is the same, regardless of anything else', function(t) { + const gn_record = { + source: 'geonames', + source_id: '123', + name: { + 'default': 'One name' + } + }; + const wof_record = { + source: 'whosonfirst', + source_id: '345', + name: { + default: 'Different name' + }, + addendum: { + concordances: '{ "gn:id": 123 }' + } + }; + + t.false(isDifferent(gn_record, wof_record), 'should be the same based on concordance'); + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) { From 533884c5363f74c555770835b623b899870e858b Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Fri, 4 Mar 2022 11:55:49 -0500 Subject: [PATCH 2/2] Simplify logic --- helper/diffPlaces.js | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index abb5682fc..537a2d960 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -188,29 +188,28 @@ function isAddressDifferent(item1, item2){ } function isGeonamesConcordanceSame(item1, item2) { - let wof_record; - let gn_record; - - if (item1.source === 'geonames' && item2.source === 'whosonfirst') { - gn_record = item1; - wof_record = item2; - } else if (item2.source === 'geonames' && item1.source === 'whosonfirst') { - gn_record = item2; - wof_record = item1; - } else { - // could not match to one geonames and one wof concordance, so this check does not apply + const items = [item1, item2]; + + const wof_record = items.find(i => i.source === 'whosonfirst'); + const gn_record = items.find(i => i.source === 'geonames'); + + // must have found one wof and one gn record or this check does not apply + if (!wof_record || !gn_record) { return false; } + + const concordances = _.get(wof_record, 'addendum.concordances'); + + if (!concordances) { return false; } - const concordances = _.get(wof_record, 'addendum.concordances'); + const json = codec.decode(concordances); + const concordance_id = json['gn:id']; - if (concordances) { - const json = codec.decode(concordances); - const concordance_id = json['gn:id']; + if (!concordance_id || !_.isNumber(concordance_id)) { return false; } - if (concordance_id && typeof concordance_id === 'number' && concordance_id.toString() === gn_record.source_id) { - return true; - } + // only records with a matching concordance pass this check + if (concordance_id.toString() === gn_record.source_id) { + return true; } return false;