Skip to content

Commit 9945cdc

Browse files
author
Animesh Pandey
committed
Added Checkstyle and other refactoring
Added checkstyle checking over whole project which resulted in extensive refactoring of the whole library. Also added JavaDocs to whole project.
1 parent 35910a3 commit 9945cdc

File tree

12 files changed

+518
-374
lines changed

12 files changed

+518
-374
lines changed

plugin.iml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<configuration sdkName="Python 2.7.11 (E:\Miniconda2\python.exe)" />
66
</facet>
77
</component>
8-
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="false">
8+
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
99
<output url="file://$MODULE_DIR$/target/classes" />
1010
<output-test url="file://$MODULE_DIR$/target/test-classes" />
1111
<content url="file://$MODULE_DIR$">
@@ -19,10 +19,10 @@
1919
<orderEntry type="inheritedJdk" />
2020
<orderEntry type="sourceFolder" forTests="false" />
2121
<orderEntry type="library" name="Python 2.7.11 (E:\Miniconda2\python.exe) interpreter library" level="application" />
22-
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-analyzers-common:6.4.1" level="project" />
23-
<orderEntry type="library" name="Maven: org.apache.lucene:lucene-core:6.4.1" level="project" />
22+
<orderEntry type="library" scope="PROVIDED" name="Maven: org.apache.lucene:lucene-analyzers-common:6.4.1" level="project" />
23+
<orderEntry type="library" scope="PROVIDED" name="Maven: org.apache.lucene:lucene-core:6.4.1" level="project" />
2424
<orderEntry type="library" name="Maven: commons-lang:commons-lang:2.6" level="project" />
25-
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
25+
<orderEntry type="library" scope="PROVIDED" name="Maven: log4j:log4j:1.2.17" level="project" />
2626
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
2727
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
2828
</component>

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@
153153
<groupId>org.apache.lucene</groupId>
154154
<artifactId>lucene-analyzers-common</artifactId>
155155
<version>6.4.1</version>
156-
<!--<scope>provided</scope>-->
156+
<scope>provided</scope>
157157
</dependency>
158158
<dependency>
159159
<groupId>commons-lang</groupId>
@@ -164,7 +164,7 @@
164164
<groupId>log4j</groupId>
165165
<artifactId>log4j</artifactId>
166166
<version>1.2.17</version>
167-
<!--<scope>provided</scope>-->
167+
<scope>provided</scope>
168168
</dependency>
169169
<dependency>
170170
<groupId>junit</groupId>

src/main/checkstyle/checkstyle.xml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,11 @@
3232
<property name="acceptOnMatch" value="false"/>
3333
</module>
3434
<module name="SuppressionCommentFilter">
35-
<!--
36-
Use suppressions.xml for suppressions, this is only example.
37-
checkFormat will prevent suppression comments from being valid.
38-
-->
39-
<property name="checkFormat" value="IGNORETHIS"/>
40-
<property name="offCommentFormat" value="CSOFF\: .*"/>
41-
<property name="onCommentFormat" value="CSON\: .*"/>
35+
<property name="offCommentFormat" value="CHECKSTYLE.OFF\: ([\w\|]+)"/>
36+
<property name="onCommentFormat" value="CHECKSTYLE.ON\: ([\w\|]+)"/>
37+
<property name="checkFormat" value="$1"/>
4238
</module>
39+
4340
<!--<module name="SuppressionFilter">-->
4441
<!--<property name="file" value="${checkstyle.suppressions.file}"/>-->
4542
<!--</module>-->
@@ -261,6 +258,7 @@
261258
<module name="RequireThis"/>
262259
<module name="ReturnCount">
263260
<property name="maxForVoid" value="0"/>
261+
<property name="max" value="5"/>
264262
</module>
265263
<module name="SimplifyBooleanExpression"/>
266264
<module name="SimplifyBooleanReturn"/>

src/main/java/com/vader/sentiment/analyzer/SentimentAnalyzer.java

Lines changed: 172 additions & 157 deletions
Large diffs are not rendered by default.

src/main/java/com/vader/sentiment/processor/InputAnalyzer.java

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@
3030
import java.util.List;
3131

3232
import org.apache.lucene.analysis.TokenStream;
33+
import org.apache.lucene.analysis.Tokenizer;
3334
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
3435
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
3536
import org.apache.lucene.analysis.standard.StandardTokenizer;
3637
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
37-
import org.apache.lucene.analysis.Tokenizer;
3838

3939
/**
4040
* This class defines a Lucene analyzer that is applied on the input string in
@@ -46,7 +46,6 @@
4646
class InputAnalyzer implements InputAnalyzerInterface {
4747
/**
4848
* This function applies a Lucene analyzer that splits a string into tokens.
49-
* <p>
5049
* Here we are using two types of Lucene {@link Tokenizer}s:
5150
* 1. {@link WhitespaceTokenizer} which tokenizes from the white spaces
5251
* 2. {@link StandardTokenizer} which tokenizes on white space and also removes any punctuation
@@ -57,15 +56,20 @@ class InputAnalyzer implements InputAnalyzerInterface {
5756
* @throws IOException if Lucene's analyzer encounters any error
5857
*/
5958
private List<String> tokenize(String inputString, boolean removePunctuation) throws IOException {
60-
StringReader reader = new StringReader(inputString);
61-
Tokenizer currentTokenizer = (removePunctuation) ? new StandardTokenizer() : new WhitespaceTokenizer();
59+
final StringReader reader = new StringReader(inputString);
60+
final Tokenizer currentTokenizer;
61+
if (removePunctuation) {
62+
currentTokenizer = new StandardTokenizer();
63+
} else {
64+
currentTokenizer = new WhitespaceTokenizer();
65+
}
6266
currentTokenizer.setReader(reader);
6367

64-
TokenStream tokenStream = new LengthFilter(currentTokenizer, 2, Integer.MAX_VALUE);
68+
final TokenStream tokenStream = new LengthFilter(currentTokenizer, 2, Integer.MAX_VALUE);
6569
final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
6670
tokenStream.reset();
6771

68-
ArrayList<String> tokenizedString = new ArrayList<>();
72+
final List<String> tokenizedString = new ArrayList<>();
6973
while (tokenStream.incrementToken()) {
7074
tokenizedString.add(charTermAttribute.toString());
7175
}
@@ -77,25 +81,19 @@ private List<String> tokenize(String inputString, boolean removePunctuation) thr
7781
}
7882

7983
/**
80-
* This is {@link InputAnalyzer#tokenize(String, boolean)} with removePunctuation set as false. So, this
81-
* method performs tokenization without removing punctuations.
84+
* Implementation of {@link InputAnalyzerInterface#defaultSplit(String)}.
8285
*
83-
* @param inputString The input string to be pre-processed with Lucene tokenizer
84-
* @return tokens
85-
* @throws IOException if Lucene's analyzer encounters any error
86+
* {@inheritDoc}
8687
*/
8788
@Override
8889
public List<String> defaultSplit(String inputString) throws IOException {
8990
return tokenize(inputString, false);
9091
}
9192

9293
/**
93-
* This is {@link InputAnalyzer#tokenize(String, boolean)} with removePunctuation set as false. So, this
94-
* method performs tokenization without removing punctuations.
94+
* Implementation of {@link InputAnalyzerInterface#removePunctuation(String)}.
9595
*
96-
* @param inputString The input string to be pre-processed with Lucene tokenizer
97-
* @return tokens
98-
* @throws IOException if Lucene's analyzer encounters any error
96+
* {@inheritDoc}
9997
*/
10098
@Override
10199
public List<String> removePunctuation(String inputString) throws IOException {

src/main/java/com/vader/sentiment/processor/InputAnalyzerInterface.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,22 @@
3333
*/
3434
interface InputAnalyzerInterface {
3535
/**
36+
* This is {@link InputAnalyzer#tokenize(String, boolean)} with removePunctuation set as false. So, this
37+
* method performs tokenization without removing punctuations.
3638
*
37-
* @param inputString
38-
* @return
39-
* @throws IOException
39+
* @param inputString The input string to be pre-processed with Lucene tokenizer
40+
* @return tokens
41+
* @throws IOException if Lucene's analyzer encounters any error
4042
*/
4143
List<String> defaultSplit(String inputString) throws IOException;
4244

4345
/**
46+
* This is {@link InputAnalyzer#tokenize(String, boolean)} with removePunctuation set as true. So, this
47+
* method performs tokenization and also removes punctuations.
4448
*
45-
* @param inputString
46-
* @return
47-
* @throws IOException
49+
* @param inputString The input string to be pre-processed with Lucene tokenizer
50+
* @return tokens
51+
* @throws IOException if Lucene's analyzer encounters any error
4852
*/
4953
List<String> removePunctuation(String inputString) throws IOException;
5054
}

src/main/java/com/vader/sentiment/processor/TextProperties.java

Lines changed: 66 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,52 @@
2424

2525
package com.vader.sentiment.processor;
2626

27-
import com.vader.sentiment.util.Utils;
28-
2927
import java.io.IOException;
3028
import java.util.Collections;
3129
import java.util.List;
3230

31+
import com.vader.sentiment.util.Utils;
32+
3333
/**
3434
* The TextProperties class implements the pre-processing steps of the input string for sentiment analysis.
35-
* It utilizes the Lucene analyzers
35+
* It utilizes the Lucene analyzer to perform processing on the input string.
3636
*
3737
* @author Animesh Pandey
3838
* Created on 4/10/2016.
3939
*/
40-
public class TextProperties {
40+
public final class TextProperties {
41+
/**
42+
* String whose properties will be extracted.
43+
*/
4144
private String inputText;
45+
46+
/**
47+
* List of tokens and emoticons extracted from the {@link TextProperties#inputText}.
48+
*/
4249
private List<String> wordsAndEmoticons;
50+
51+
/**
52+
* List of tokens extracted from the {@link TextProperties#inputText}.
53+
* Emoticons are removed here.
54+
*/
4355
private List<String> wordsOnly;
44-
private boolean isCapDIff;
56+
57+
/**
58+
* Flag that specifies whether the current string has yelling words.
59+
*/
60+
private boolean isCapDiff;
61+
62+
/**
63+
* Parameterized constructor accepting the input string that will be processed.
64+
*
65+
* @param inputText the input string
66+
* @throws IOException if there is an issue with the lucene analyzers
67+
*/
68+
public TextProperties(String inputText) throws IOException {
69+
this.inputText = inputText;
70+
setWordsAndEmoticons();
71+
setCapDiff(isAllCapDifferential());
72+
}
4573

4674
/**
4775
* This method tokenizes the input string, preserving the punctuation marks using
@@ -52,61 +80,43 @@ public class TextProperties {
5280
private void setWordsAndEmoticons() throws IOException {
5381
setWordsOnly();
5482

55-
List<String> wordsAndEmoticonsList = new InputAnalyzer().defaultSplit(inputText);
83+
final List<String> wordsAndEmoticonsList = new InputAnalyzer().defaultSplit(inputText);
5684
for (String currentWord : wordsOnly) {
5785
for (String currentPunc : Utils.PUNCTUATION_LIST) {
58-
String pWord = currentWord + currentPunc;
59-
Integer pWordCount = Collections.frequency(wordsAndEmoticonsList, pWord);
60-
while (pWordCount > 0) {
61-
int index = wordsAndEmoticonsList.indexOf(pWord);
62-
wordsAndEmoticonsList.remove(pWord);
86+
final String wordPunct = currentWord + currentPunc;
87+
Integer wordPunctCount = Collections.frequency(wordsAndEmoticonsList, wordPunct);
88+
while (wordPunctCount > 0) {
89+
final int index = wordsAndEmoticonsList.indexOf(wordPunct);
90+
wordsAndEmoticonsList.remove(wordPunct);
6391
wordsAndEmoticonsList.add(index, currentWord);
64-
pWordCount = Collections.frequency(wordsAndEmoticonsList, pWord);
92+
wordPunctCount = Collections.frequency(wordsAndEmoticonsList, wordPunct);
6593
}
6694

67-
String wordP = currentPunc + currentWord;
68-
Integer wordPCount = Collections.frequency(wordsAndEmoticonsList, wordP);
69-
while (wordPCount > 0) {
70-
int index = wordsAndEmoticonsList.indexOf(wordP);
71-
wordsAndEmoticonsList.remove(wordP);
95+
final String punctWord = currentPunc + currentWord;
96+
Integer punctWordCount = Collections.frequency(wordsAndEmoticonsList, punctWord);
97+
while (punctWordCount > 0) {
98+
final int index = wordsAndEmoticonsList.indexOf(punctWord);
99+
wordsAndEmoticonsList.remove(punctWord);
72100
wordsAndEmoticonsList.add(index, currentWord);
73-
wordPCount = Collections.frequency(wordsAndEmoticonsList, wordP);
101+
punctWordCount = Collections.frequency(wordsAndEmoticonsList, punctWord);
74102
}
75103
}
76104
}
77105
this.wordsAndEmoticons = wordsAndEmoticonsList;
78106
}
79107

80108
/**
81-
* This method tokenizes the input string, removing the special characters as well
109+
* This method tokenizes the input string, removing the special characters as well.
82110
*
83-
* @throws IOException
111+
* @throws IOException if there is an error while using Lucene analyzers.
84112
* @see InputAnalyzer#removePunctuation(String)
85113
*/
86114
private void setWordsOnly() throws IOException {
87115
this.wordsOnly = new InputAnalyzer().removePunctuation(inputText);
88116
}
89117

90-
private void setCapDiff(boolean capDIff) {
91-
isCapDIff = capDIff;
92-
}
93-
94-
/**
95-
* @return True iff the input has yelling words i.e. all caps in the tokens, but all the token should not be
96-
* in upper case.
97-
* e.g. [GET, THE, HELL, OUT] returns false
98-
* [GET, the, HELL, OUT] returns true
99-
* [get, the, hell, out] returns false
100-
*/
101-
private boolean isAllCapDifferential() {
102-
int countAllCaps = 0;
103-
for (String s : wordsAndEmoticons) {
104-
if (Utils.isUpper(s)) {
105-
countAllCaps++;
106-
}
107-
}
108-
int capDifferential = wordsAndEmoticons.size() - countAllCaps;
109-
return (0 < capDifferential) && (capDifferential < wordsAndEmoticons.size());
118+
private void setCapDiff(boolean capDiff) {
119+
this.isCapDiff = capDiff;
110120
}
111121

112122
public List<String> getWordsAndEmoticons() {
@@ -118,25 +128,26 @@ public List<String> getWordsOnly() {
118128
}
119129

120130
public boolean isCapDiff() {
121-
return isCapDIff;
131+
return isCapDiff;
122132
}
123133

124134
/**
135+
* Returns true iff the input has yelling words, i.e. some tokens are in all caps, but not every token is
136+
* in upper case.
137+
* e.g. [GET, THE, HELL, OUT] returns false
138+
* [GET, the, HELL, OUT] returns true
139+
* [get, the, hell, out] returns false
125140
*
126-
* @param inputText
127-
* @throws IOException
141+
* @return boolean value
128142
*/
129-
public TextProperties(String inputText) throws IOException {
130-
this.inputText = inputText;
131-
setWordsAndEmoticons();
132-
setCapDiff(isAllCapDifferential());
133-
}
134-
135-
public static void main(String[] args) throws IOException {
136-
String input = "The plot was good, but the characters are uncompelling and the dialog is not great. :( :(";
137-
TextProperties properties = new TextProperties(input);
138-
System.out.println(properties.getWordsOnly());
139-
System.out.println(properties.getWordsAndEmoticons());
140-
System.out.println(properties.isCapDiff());
143+
private boolean isAllCapDifferential() {
144+
int countAllCaps = 0;
145+
for (String token : wordsAndEmoticons) {
146+
if (Utils.isUpper(token)) {
147+
countAllCaps++;
148+
}
149+
}
150+
final int capDifferential = wordsAndEmoticons.size() - countAllCaps;
151+
return (0 < capDifferential) && (capDifferential < wordsAndEmoticons.size());
141152
}
142153
}

0 commit comments

Comments
 (0)