From 55549a1746cb0c0eb2aa79c9d60c71481d4c5c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= Date: Fri, 7 Dec 2018 09:49:38 +0100 Subject: [PATCH 1/7] registrations of the stopwords files outside of the lib directory --- Readme.md | 2 +- lib/lda.js | 17 +++++++++++- package.json | 2 +- test4.js | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 test4.js diff --git a/Readme.md b/Readme.md index 6d8e727..9c4538b 100644 --- a/Readme.md +++ b/Readme.md @@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: ```javascript exports.stop_words = [ diff --git a/lib/lda.js b/lib/lda.js index 081e854..4d795ab 100644 --- a/lib/lda.js +++ b/lib/lda.js @@ -1,5 +1,11 @@ var stem = require('stem-porter'); +var STOP_WORDS_MAP = { + en: './stopwords_en.js', + de: './stopwords_de.js', + es: './stopwords_es.js', +}; + // // Based on javascript implementation https://github.com/awaisathar/lda.js // Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java @@ -22,7 +28,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var stopwords = new Array(); languages.forEach(function(value) { - var stopwordsLang = require('./stopwords_' + value + ".js"); + var path = STOP_WORDS_MAP[value]; + if (!path) { + return; + } + var stopwordsLang = require(path); stopwords = stopwords.concat(stopwordsLang.stop_words); }); @@ -99,6 +109,11 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag return result; } +process.registerStopwords = function(language, path) { + STOP_WORDS_MAP[language] = path; + return this; +}; + function makeArray(x) { var a = new Array(); for (var i=0;i= 0.8.x" diff --git a/test4.js b/test4.js new file mode 100644 index 0000000..150fc59 --- /dev/null +++ b/test4.js @@ -0,0 +1,77 @@ +const lda = require('./lib/lda'); +const path = require('path'); + +lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js')); + +const collection = [ + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + '', + 'Slippers are soft on your feet.' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + null, + 'Slippers are soft on your feet.' + ], + [ + '', + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' + ], + [ + null, + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + '' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + null + ] +]; + +var probabilities = []; + +collection.forEach((documents, i) => { + const results = lda(documents, 3, 2, ['en_override'], null, null, 123); + + // Save the probabilities for each group. The values should be the same, since we're using the same random seed. + const groupProbs = []; + results.forEach(group => { + group.forEach(row => { + groupProbs.push(row.probability); + }); + }); + + // Store the entire group in an array. + probabilities.push(groupProbs); + + //console.log('\nSet ' + (i + 1)); + //console.log(results); +}); + +var success = true; + +// Verify the probabilities for each group are the same, even with empty and null values in the docs. +probabilities.forEach((group, i) => { + if (group[0] !== 0.15 || group[1] !== 0.14 || group[2] !== 0.16 || group[3] !== 0.15 || group[4] !== 0.16 || group[5] !== 0.14) { + console.log('Failed expected values for group ' + i); + success = false; + } +}); + +if (success) { + console.log('\nResult OK.'); +} \ No newline at end of file From b85cd49536bafdde23e67c3f968b37903ea11894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ma=C4=8Duda?= Date: Fri, 7 Dec 2018 09:53:56 +0100 Subject: [PATCH 2/7] update of the readme --- Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readme.md b/Readme.md index 9c4538b..1a3f214 100644 --- a/Readme.md +++ b/Readme.md @@ -95,7 +95,7 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, register a file to the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file for the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: ```javascript exports.stop_words = [ From a22a8f298f7f0663da163c4df12af4e8f1bdda54 Mon Sep 17 00:00:00 2001 From: Kory Becker Date: Mon, 15 Jul 2019 15:12:45 -0400 Subject: [PATCH 3/7] Updated documentation for languages. --- Readme.md | 14 ++++++--- package.json | 2 +- test5.js | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 test5.js diff --git a/Readme.md b/Readme.md index 1a3f214..b3ac774 100644 --- a/Readme.md +++ b/Readme.md @@ -69,20 +69,20 @@ var result = lda(documents, 2, 5); for (var i in result) { var row = result[i]; console.log('Topic ' + (parseInt(i) + 1)); - + // For each term. for (var j in row) { var term = row[j]; console.log(term.term + ' (' + term.probability + '%)'); } - + console.log(''); } ``` ## Additional Languages -LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows: +LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows: ```javascript // Use English (this is the default). @@ -95,7 +95,13 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, register a file for the specific language. For example, a French stop-words register the language `lda.registerStopWords('fr', '/path/to/the/french/stopwords.js')`. The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file for the specific language. For example, to register a French stop-words list use the following code. + +```js +lda.registerStopWords('fr', '/path/to/the/french/stopwords.js') +``` + +The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is shown below. ```javascript exports.stop_words = [ diff --git a/package.json b/package.json index c97b6a1..07ca2e6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "lda", - "version": "0.2.0", + "version": "0.3.0", "description": "LDA topic modeling for node.js.", "author": { "name": "Kory Becker", diff --git a/test5.js b/test5.js new file mode 100644 index 0000000..76a97ed --- /dev/null +++ b/test5.js @@ -0,0 +1,85 @@ +var lda = require('./lib/lda'); + +var text = 'Hola, tu estas muy ocupada hoy? Esta bonita afuera, pero hace un poco de calor hoy. Tu tienes algo? Tu quieres ir a el banco? Tu puedes comprar algo aqui con tu dinero.'; +var documents = text.match( /[^\.!\?]+[\.!\?]+/g ); + +var result_en = lda(documents, 2, 5, ['en'], null, null, 123); +var result_es = lda(documents, 2, 5, ['es'], null, null, 123); +var result_multi = lda(documents, 2, 5, ['invalid1', 'en', 'es', 'invalid2'], null, null, 123); + +var findTerm = function(term, topics) { + for (var i in topics) { + var row = topics[i]; + console.log('Topic ' + (parseInt(i) + 1)); + + // For each term. + for (var j in row) { + var aterm = row[j]; + console.log(aterm.term + ' (' + aterm.probability + '%)'); + + if (aterm.term === term) { + console.log('*** Found ' + term); + return term; + } + } + + console.log(''); + } + + return null; +}; + +// For each topic. +var success = true; +var target_term = 'tu'; // Stop-words term that should be removed when using the designated stop-words list (i.e., spanish). + +// Look for the stop-word in the resulting topics using English and Spanish. The term should exist in English, but not in Spanish. +console.log('Using English stop-words.'); +var result1 = findTerm(target_term, result_en); +if (!result1) { + console.log('\nFailed English stop-words check. Failed to find expected stop-word: "' + target_term + '" as a topic.') + success = false; +} + +console.log('\nUsing Spanish stop-words.'); +var result2 = findTerm(target_term, result_es) +if (result2) { + console.log('\nFailed Spanish stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + success = false; +} + +console.log('\nUsing English and Spanish stop-words.'); +var result3 = findTerm(target_term, result_multi); +if (result3) { + console.log('\nFailed English, Spanish, invalid stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + success = false; +} + +// Confirm the probabilities are equal when using the Spanish stop-words list and a list containing Spanish and invalid stop-word paths. +const groupProbs1 = []; +result_es.forEach(group => { + group.forEach(row => { + groupProbs1.push(row.probability); + }); +}); + +const groupProbs2 = []; +result_multi.forEach(group => { + group.forEach(row => { + groupProbs2.push(row.probability); + }); +}); + +for (var i=0; i Date: Mon, 15 Jul 2019 15:20:29 -0400 Subject: [PATCH 4/7] Cleanup. --- test5.js | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test5.js b/test5.js index 76a97ed..30a0aaf 100644 --- a/test5.js +++ b/test5.js @@ -38,21 +38,21 @@ console.log('Using English stop-words.'); var result1 = findTerm(target_term, result_en); if (!result1) { console.log('\nFailed English stop-words check. Failed to find expected stop-word: "' + target_term + '" as a topic.') - success = false; + return; } console.log('\nUsing Spanish stop-words.'); var result2 = findTerm(target_term, result_es) if (result2) { console.log('\nFailed Spanish stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') - success = false; + return; } console.log('\nUsing English and Spanish stop-words.'); var result3 = findTerm(target_term, result_multi); if (result3) { console.log('\nFailed English, Spanish, invalid stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') - success = false; + return; } // Confirm the probabilities are equal when using the Spanish stop-words list and a list containing Spanish and invalid stop-word paths. @@ -75,8 +75,7 @@ for (var i=0; i Date: Mon, 15 Jul 2019 15:23:13 -0400 Subject: [PATCH 5/7] Wording. --- test5.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test5.js b/test5.js index 30a0aaf..c533393 100644 --- a/test5.js +++ b/test5.js @@ -51,7 +51,7 @@ if (result2) { console.log('\nUsing English and Spanish stop-words.'); var result3 = findTerm(target_term, result_multi); if (result3) { - console.log('\nFailed English, Spanish, invalid stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + console.log('\nFailed Multiple stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') return; } From 6ea16fb15a88ad84fda179b5f052f49adb609222 Mon Sep 17 00:00:00 2001 From: Kory Becker Date: Mon, 15 Jul 2019 21:33:23 -0400 Subject: [PATCH 6/7] Added backwards compatibility for languages already existing with filename stopwords_xx.js. --- Readme.md | 2 +- lib/lda.js | 62 ++++++++++++++++++-------------- test5.js | 14 ++++---- test6.js | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 34 deletions(-) create mode 100644 test6.js diff --git a/Readme.md b/Readme.md index b3ac774..72ca6ba 100644 --- a/Readme.md +++ b/Readme.md @@ -98,7 +98,7 @@ result = lda(documents, 2, 5, ['en', 'de']); To add a new language-specific stop-words list, register a file for the specific language. For example, to register a French stop-words list use the following code. ```js -lda.registerStopWords('fr', '/path/to/the/french/stopwords.js') +lda.registerStopwords('fr', '/path/to/the/french/stopwords.js') ``` The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is shown below. diff --git a/lib/lda.js b/lib/lda.js index 4d795ab..4f19228 100644 --- a/lib/lda.js +++ b/lib/lda.js @@ -28,11 +28,21 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var stopwords = new Array(); languages.forEach(function(value) { + var stopwordsLang; + var path = STOP_WORDS_MAP[value]; if (!path) { - return; + // Try loading the file directly. + try { + stopwordsLang = require('./stopwords_' + value + ".js"); + } + catch { + console.log('Warning: Ignoring invalid stop-word list "' + value + '". Please register your stop-words file using: lda.registerStopwords(\'' + value + '\', \'/path/to/stopwords_' + value + '.js\')'); + return; + } } - var stopwordsLang = require(path); + + stopwordsLang = stopwordsLang || require(path); stopwords = stopwords.concat(stopwordsLang.stop_words); }); @@ -47,15 +57,15 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var w=words[wc].toLowerCase().replace(/[^a-z\'A-Z0-9\u00C0-\u00ff ]+/g, ''); var wStemmed = stem(w); if (w=="" || !wStemmed || w.length==1 || stopwords.indexOf(w.replace("'", "")) > -1 || stopwords.indexOf(wStemmed) > -1 || w.indexOf("http")==0) continue; - if (f[wStemmed]) { + if (f[wStemmed]) { f[wStemmed]=f[wStemmed]+1; - } - else if(wStemmed) { - f[wStemmed]=1; + } + else if(wStemmed) { + f[wStemmed]=1; vocab.push(wStemmed); vocabOrig[wStemmed] = w; }; - + documents[i].push(vocab.indexOf(wStemmed)); } } @@ -88,14 +98,14 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag //console.log('Topic ' + (k + 1)); var row = []; - + for (var t = 0; t < topTerms; t++) { var topicTerm=things[t].split("_")[2]; var prob=parseInt(things[t].split("_")[0]*100); if (prob<2) continue; - + //console.log('Top Term: ' + topicTerm + ' (' + prob + '%)'); - + var term = {}; term.term = topicTerm; term.probability = parseFloat(things[t].split("_")[0]); @@ -105,7 +115,7 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag result.push(row); } } - + return result; } @@ -115,7 +125,7 @@ process.registerStopwords = function(language, path) { }; function makeArray(x) { - var a = new Array(); + var a = new Array(); for (var i=0;i this.BURN_IN) && (this.SAMPLE_LAG > 0) && (i % this.SAMPLE_LAG == 0)) { this.updateParams(); - //document.write("|"); + //document.write("|"); if (i % this.THIN_INTERVAL != 0) this.dispcol++; } if (this.dispcol >= 100) { - //document.write("*
"); + //document.write("*
"); this.dispcol = 0; } } } - + this.sampleFullConditional = function(m,n) { var topic = this.z[m][n]; this.nw[this.documents[m][n]][topic]--; @@ -241,7 +251,7 @@ var lda = new function() { this.ndsum[m]++; return topic; } - + this.updateParams =function () { for (var m = 0; m < this.documents.length; m++) { for (var k = 0; k < this.K; k++) { @@ -255,7 +265,7 @@ var lda = new function() { } this.numstats++; } - + this.getTheta = function() { var theta = new Array(); for(var i=0;i 0) { @@ -273,7 +283,7 @@ var lda = new function() { } return theta; } - + this.getPhi = function () { var phi = new Array(); for(var i=0;i 0) { diff --git a/test5.js b/test5.js index c533393..f91cc36 100644 --- a/test5.js +++ b/test5.js @@ -10,15 +10,15 @@ var result_multi = lda(documents, 2, 5, ['invalid1', 'en', 'es', 'invalid2'], nu var findTerm = function(term, topics) { for (var i in topics) { var row = topics[i]; - console.log('Topic ' + (parseInt(i) + 1)); + //console.log('Topic ' + (parseInt(i) + 1)); // For each term. for (var j in row) { var aterm = row[j]; - console.log(aterm.term + ' (' + aterm.probability + '%)'); + //console.log(aterm.term + ' (' + aterm.probability + '%)'); if (aterm.term === term) { - console.log('*** Found ' + term); + console.log('Found "' + term + '"'); return term; } } @@ -37,21 +37,21 @@ var target_term = 'tu'; // Stop-words term that should be removed when using the console.log('Using English stop-words.'); var result1 = findTerm(target_term, result_en); if (!result1) { - console.log('\nFailed English stop-words check. Failed to find expected stop-word: "' + target_term + '" as a topic.') + console.log('\nFailed English stop-words check! Failed to find expected stop-word: "' + target_term + '" as a topic.') return; } console.log('\nUsing Spanish stop-words.'); var result2 = findTerm(target_term, result_es) if (result2) { - console.log('\nFailed Spanish stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + console.log('\nFailed Spanish stop-words check! Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') return; } console.log('\nUsing English and Spanish stop-words.'); var result3 = findTerm(target_term, result_multi); if (result3) { - console.log('\nFailed Multiple stop-words check. Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + console.log('\nFailed Multiple stop-words check! Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') return; } @@ -72,7 +72,7 @@ result_multi.forEach(group => { for (var i=0; i Date: Mon, 15 Jul 2019 21:35:27 -0400 Subject: [PATCH 7/7] Added context. --- test6.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test6.js b/test6.js index 533255c..4e686c6 100644 --- a/test6.js +++ b/test6.js @@ -34,7 +34,7 @@ var success1 = false; var success2 = false; var success3 = false; -console.log('Test 1: Run lda with the default stop-words list. Ignore warning.'); +console.log('Test 1: Run lda without a custom stop-words list. Ignore warning.'); results = lda(documents, 2, 5, ['custom_lang'], null, null, 123); @@ -49,7 +49,7 @@ else { return; } -console.log('\nTest 2: Run lda with a default stop-words list copied into the lib folder.'); +console.log('\nTest 2: Run lda with a custom stop-words list copied into the lib folder.'); // Copy the language file to a default file in the lib folder. const copyPath = './lib/stopwords_' + filePath.replace('./', ''); @@ -72,7 +72,7 @@ else { // Cleanup. fs.unlinkSync(copyPath); -console.log('\nTest 3: Register the custom stop-words list.'); +console.log('\nTest 3: Register a custom stop-words list programmatically.'); lda.registerStopwords('custom_lang', path.resolve(__dirname, filePath)); results = lda(documents, 2, 5, ['custom_lang'], null, null, 123);