diff --git a/Readme.md b/Readme.md index 6d8e727..72ca6ba 100644 --- a/Readme.md +++ b/Readme.md @@ -69,20 +69,20 @@ var result = lda(documents, 2, 5); for (var i in result) { var row = result[i]; console.log('Topic ' + (parseInt(i) + 1)); - + // For each term. for (var j in row) { var term = row[j]; console.log(term.term + ' (' + term.probability + '%)'); } - + console.log(''); } ``` ## Additional Languages -LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows: +LDA uses [stop-words](https://en.wikipedia.org/wiki/Stop_words) to ignore common terms in the text (for example: this, that, it, we). By default, the stop-words list uses English. To use additional languages, you can specify an array of language ids, as follows: ```javascript // Use English (this is the default). @@ -95,7 +95,13 @@ result = lda(documents, 2, 5, ['de']); result = lda(documents, 2, 5, ['en', 'de']); ``` -To add a new language-specific stop-words list, create a file /lda/lib/stopwords_XX.js where XX is the id for the language. For example, a French stop-words list could be named "stopwords_fr.js". The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is, as follows: +To add a new language-specific stop-words list, register a file for the specific language. For example, to register a French stop-words list use the following code. + +```js +lda.registerStopwords('fr', '/path/to/the/french/stopwords.js') +``` + +The contents of the file should follow the format of an [existing](https://github.com/primaryobjects/lda/blob/master/lib/stopwords_en.js) stop-words list. The format is shown below. ```javascript exports.stop_words = [ diff --git a/lib/lda.js b/lib/lda.js index 081e854..4f19228 100644 --- a/lib/lda.js +++ b/lib/lda.js @@ -1,5 +1,11 @@ var stem = require('stem-porter'); +var STOP_WORDS_MAP = { + en: './stopwords_en.js', + de: './stopwords_de.js', + es: './stopwords_es.js', +}; + // // Based on javascript implementation https://github.com/awaisathar/lda.js // Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java @@ -22,7 +28,21 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var stopwords = new Array(); languages.forEach(function(value) { - var stopwordsLang = require('./stopwords_' + value + ".js"); + var stopwordsLang; + + var path = STOP_WORDS_MAP[value]; + if (!path) { + // Try loading the file directly. + try { + stopwordsLang = require('./stopwords_' + value + ".js"); + } + catch { + console.log('Warning: Ignoring invalid stop-word list "' + value + '". Please register your stop-words file using: lda.registerStopwords(\'' + value + '\', \'/path/to/stopwords_' + value + '.js\')'); + return; + } + } + + stopwordsLang = stopwordsLang || require(path); stopwords = stopwords.concat(stopwordsLang.stop_words); }); @@ -37,15 +57,15 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag var w=words[wc].toLowerCase().replace(/[^a-z\'A-Z0-9\u00C0-\u00ff ]+/g, ''); var wStemmed = stem(w); if (w=="" || !wStemmed || w.length==1 || stopwords.indexOf(w.replace("'", "")) > -1 || stopwords.indexOf(wStemmed) > -1 || w.indexOf("http")==0) continue; - if (f[wStemmed]) { + if (f[wStemmed]) { f[wStemmed]=f[wStemmed]+1; - } - else if(wStemmed) { - f[wStemmed]=1; + } + else if(wStemmed) { + f[wStemmed]=1; vocab.push(wStemmed); vocabOrig[wStemmed] = w; }; - + documents[i].push(vocab.indexOf(wStemmed)); } } @@ -78,14 +98,14 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag //console.log('Topic ' + (k + 1)); var row = []; - + for (var t = 0; t < topTerms; t++) { var topicTerm=things[t].split("_")[2]; var prob=parseInt(things[t].split("_")[0]*100); if (prob<2) continue; - + //console.log('Top Term: ' + topicTerm + ' (' + prob + '%)'); - + var term = {}; term.term = topicTerm; term.probability = parseFloat(things[t].split("_")[0]); @@ -95,12 +115,17 @@ var process = function(sentences, numberOfTopics, numberOfTermsPerTopic, languag result.push(row); } } - + return result; } +process.registerStopwords = function(language, path) { + STOP_WORDS_MAP[language] = path; + return this; +}; + function makeArray(x) { - var a = new Array(); + var a = new Array(); for (var i=0;i this.BURN_IN) && (this.SAMPLE_LAG > 0) && (i % this.SAMPLE_LAG == 0)) { this.updateParams(); - //document.write("|"); + //document.write("|"); if (i % this.THIN_INTERVAL != 0) this.dispcol++; } if (this.dispcol >= 100) { - //document.write("*
"); + //document.write("*
"); this.dispcol = 0; } } } - + this.sampleFullConditional = function(m,n) { var topic = this.z[m][n]; this.nw[this.documents[m][n]][topic]--; @@ -226,7 +251,7 @@ var lda = new function() { this.ndsum[m]++; return topic; } - + this.updateParams =function () { for (var m = 0; m < this.documents.length; m++) { for (var k = 0; k < this.K; k++) { @@ -240,7 +265,7 @@ var lda = new function() { } this.numstats++; } - + this.getTheta = function() { var theta = new Array(); for(var i=0;i 0) { @@ -258,7 +283,7 @@ var lda = new function() { } return theta; } - + this.getPhi = function () { var phi = new Array(); for(var i=0;i 0) { diff --git a/package.json b/package.json index 4218389..07ca2e6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "lda", - "version": "0.2.0", + "version": "0.3.0", "description": "LDA topic modeling for node.js.", "author": { "name": "Kory Becker", @@ -13,7 +13,7 @@ }, "main": "./lib", "dependencies": { - "stem-porter": "*" + "stem-porter": "*" }, "engines": { "node": ">= 0.8.x" diff --git a/test4.js b/test4.js new file mode 100644 index 0000000..150fc59 --- /dev/null +++ b/test4.js @@ -0,0 +1,77 @@ +const lda = require('./lib/lda'); +const path = require('path'); + +lda.registerStopwords('en_override', path.resolve(__dirname, './lib/stopwords_en.js')); + +const collection = [ + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + '', + 'Slippers are soft on your feet.' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + null, + 'Slippers are soft on your feet.' + ], + [ + '', + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' + ], + [ + null, + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + '' + ], + [ + 'Ruby slippers are pretty and fun.', + 'Long walks in the park are fun.', + 'Slippers are soft on your feet.', + null + ] +]; + +var probabilities = []; + +collection.forEach((documents, i) => { + const results = lda(documents, 3, 2, ['en_override'], null, null, 123); + + // Save the probabilities for each group. The values should be the same, since we're using the same random seed. + const groupProbs = []; + results.forEach(group => { + group.forEach(row => { + groupProbs.push(row.probability); + }); + }); + + // Store the entire group in an array. + probabilities.push(groupProbs); + + //console.log('\nSet ' + (i + 1)); + //console.log(results); +}); + +var success = true; + +// Verify the probabilities for each group are the same, even with empty and null values in the docs. +probabilities.forEach((group, i) => { + if (group[0] !== 0.15 || group[1] !== 0.14 || group[2] !== 0.16 || group[3] !== 0.15 || group[4] !== 0.16 || group[5] !== 0.14) { + console.log('Failed expected values for group ' + i); + success = false; + } +}); + +if (success) { + console.log('\nResult OK.'); +} \ No newline at end of file diff --git a/test5.js b/test5.js new file mode 100644 index 0000000..f91cc36 --- /dev/null +++ b/test5.js @@ -0,0 +1,84 @@ +var lda = require('./lib/lda'); + +var text = 'Hola, tu estas muy ocupada hoy? Esta bonita afuera, pero hace un poco de calor hoy. Tu tienes algo? Tu quieres ir a el banco? Tu puedes comprar algo aqui con tu dinero.'; +var documents = text.match( /[^\.!\?]+[\.!\?]+/g ); + +var result_en = lda(documents, 2, 5, ['en'], null, null, 123); +var result_es = lda(documents, 2, 5, ['es'], null, null, 123); +var result_multi = lda(documents, 2, 5, ['invalid1', 'en', 'es', 'invalid2'], null, null, 123); + +var findTerm = function(term, topics) { + for (var i in topics) { + var row = topics[i]; + //console.log('Topic ' + (parseInt(i) + 1)); + + // For each term. + for (var j in row) { + var aterm = row[j]; + //console.log(aterm.term + ' (' + aterm.probability + '%)'); + + if (aterm.term === term) { + console.log('Found "' + term + '"'); + return term; + } + } + + console.log(''); + } + + return null; +}; + +// For each topic. +var success = true; +var target_term = 'tu'; // Stop-words term that should be removed when using the designated stop-words list (i.e., spanish). + +// Look for the stop-word in the resulting topics using English and Spanish. The term should exist in English, but not in Spanish. +console.log('Using English stop-words.'); +var result1 = findTerm(target_term, result_en); +if (!result1) { + console.log('\nFailed English stop-words check! Failed to find expected stop-word: "' + target_term + '" as a topic.') + return; +} + +console.log('\nUsing Spanish stop-words.'); +var result2 = findTerm(target_term, result_es) +if (result2) { + console.log('\nFailed Spanish stop-words check! Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + return; +} + +console.log('\nUsing English and Spanish stop-words.'); +var result3 = findTerm(target_term, result_multi); +if (result3) { + console.log('\nFailed Multiple stop-words check! Found stop-word: "' + target_term + '" as a topic, when it should have been removed.') + return; +} + +// Confirm the probabilities are equal when using the Spanish stop-words list and a list containing Spanish and invalid stop-word paths. +const groupProbs1 = []; +result_es.forEach(group => { + group.forEach(row => { + groupProbs1.push(row.probability); + }); +}); + +const groupProbs2 = []; +result_multi.forEach(group => { + group.forEach(row => { + groupProbs2.push(row.probability); + }); +}); + +for (var i=0; i