24
24
25
25
package com .vader .sentiment .processor ;
26
26
27
- import com .vader .sentiment .util .Utils ;
28
-
29
27
import java .io .IOException ;
30
28
import java .util .Collections ;
31
29
import java .util .List ;
32
30
31
+ import com .vader .sentiment .util .Utils ;
32
+
33
33
/**
34
34
* The TextProperties class implements the pre-processing steps of the input string for sentiment analysis.
35
- * It utilizes the Lucene analyzers
35
+ * It utilizes the Lucene analyzer to perform processing on the input string.
36
36
*
37
37
* @author Animesh Pandey
38
38
* Created on 4/10/2016.
39
39
*/
40
- public class TextProperties {
40
+ public final class TextProperties {
41
+ /**
42
+ * String whose properties will be extracted.
43
+ */
41
44
private String inputText ;
45
+
46
+ /**
47
+ * List of tokens and emoticons extracted from the {@link TextProperties#inputText}.
48
+ */
42
49
private List <String > wordsAndEmoticons ;
50
+
51
+ /**
52
+ * List of tokens extracted from the {@link TextProperties#inputText}.
53
+ * Emoticons are removed here.
54
+ */
43
55
private List <String > wordsOnly ;
44
- private boolean isCapDIff ;
56
+
57
+ /**
58
+ * Flag that specifies if the current string has yelling words.
59
+ */
60
+ private boolean isCapDiff ;
61
+
62
+ /**
63
+ * Parameterized constructor accepting the input string that will be processed.
64
+ *
65
+ * @param inputText the input string
66
+ * @throws IOException if there is an issue with the Lucene analyzers
67
+ */
68
+ public TextProperties (String inputText ) throws IOException {
69
+ this .inputText = inputText ;
70
+ setWordsAndEmoticons ();
71
+ setCapDiff (isAllCapDifferential ());
72
+ }
45
73
46
74
/**
47
75
* This method tokenizes the input string, preserving the punctuation marks using
@@ -52,61 +80,43 @@ public class TextProperties {
52
80
private void setWordsAndEmoticons () throws IOException {
53
81
setWordsOnly ();
54
82
55
- List <String > wordsAndEmoticonsList = new InputAnalyzer ().defaultSplit (inputText );
83
+ final List <String > wordsAndEmoticonsList = new InputAnalyzer ().defaultSplit (inputText );
56
84
for (String currentWord : wordsOnly ) {
57
85
for (String currentPunc : Utils .PUNCTUATION_LIST ) {
58
- String pWord = currentWord + currentPunc ;
59
- Integer pWordCount = Collections .frequency (wordsAndEmoticonsList , pWord );
60
- while (pWordCount > 0 ) {
61
- int index = wordsAndEmoticonsList .indexOf (pWord );
62
- wordsAndEmoticonsList .remove (pWord );
86
+ final String wordPunct = currentWord + currentPunc ;
87
+ Integer wordPunctCount = Collections .frequency (wordsAndEmoticonsList , wordPunct );
88
+ while (wordPunctCount > 0 ) {
89
+ final int index = wordsAndEmoticonsList .indexOf (wordPunct );
90
+ wordsAndEmoticonsList .remove (wordPunct );
63
91
wordsAndEmoticonsList .add (index , currentWord );
64
- pWordCount = Collections .frequency (wordsAndEmoticonsList , pWord );
92
+ wordPunctCount = Collections .frequency (wordsAndEmoticonsList , wordPunct );
65
93
}
66
94
67
- String wordP = currentPunc + currentWord ;
68
- Integer wordPCount = Collections .frequency (wordsAndEmoticonsList , wordP );
69
- while (wordPCount > 0 ) {
70
- int index = wordsAndEmoticonsList .indexOf (wordP );
71
- wordsAndEmoticonsList .remove (wordP );
95
+ final String punctWord = currentPunc + currentWord ;
96
+ Integer punctWordCount = Collections .frequency (wordsAndEmoticonsList , punctWord );
97
+ while (punctWordCount > 0 ) {
98
+ final int index = wordsAndEmoticonsList .indexOf (punctWord );
99
+ wordsAndEmoticonsList .remove (punctWord );
72
100
wordsAndEmoticonsList .add (index , currentWord );
73
- wordPCount = Collections .frequency (wordsAndEmoticonsList , wordP );
101
+ punctWordCount = Collections .frequency (wordsAndEmoticonsList , punctWord );
74
102
}
75
103
}
76
104
}
77
105
this .wordsAndEmoticons = wordsAndEmoticonsList ;
78
106
}
79
107
80
108
/**
81
- * This method tokenizes the input string, removing the special characters as well
109
+ * This method tokenizes the input string, removing the special characters as well.
82
110
*
83
- * @throws IOException
111
+ * @throws IOException if there is an error while using Lucene analyzers.
84
112
* @see InputAnalyzer#removePunctuation(String)
85
113
*/
86
114
private void setWordsOnly () throws IOException {
87
115
this .wordsOnly = new InputAnalyzer ().removePunctuation (inputText );
88
116
}
89
117
90
- private void setCapDiff (boolean capDIff ) {
91
- isCapDIff = capDIff ;
92
- }
93
-
94
- /**
95
- * @return True iff the input has yelling words i.e. all caps in the tokens, but all the token should not be
96
- * in upper case.
97
- * e.g. [GET, THE, HELL, OUT] returns false
98
- * [GET, the, HELL, OUT] returns true
99
- * [get, the, hell, out] returns false
100
- */
101
- private boolean isAllCapDifferential () {
102
- int countAllCaps = 0 ;
103
- for (String s : wordsAndEmoticons ) {
104
- if (Utils .isUpper (s )) {
105
- countAllCaps ++;
106
- }
107
- }
108
- int capDifferential = wordsAndEmoticons .size () - countAllCaps ;
109
- return (0 < capDifferential ) && (capDifferential < wordsAndEmoticons .size ());
118
+ private void setCapDiff (boolean capDiff ) {
119
+ this .isCapDiff = capDiff ;
110
120
}
111
121
112
122
public List <String > getWordsAndEmoticons () {
@@ -118,25 +128,26 @@ public List<String> getWordsOnly() {
118
128
}
119
129
120
130
public boolean isCapDiff () {
121
- return isCapDIff ;
131
+ return isCapDiff ;
122
132
}
123
133
124
134
/**
135
+ * Returns true iff the input has yelling words, i.e. some tokens are in all caps, but not every token is
136
+ * in upper case.
137
+ * e.g. [GET, THE, HELL, OUT] returns false
138
+ * [GET, the, HELL, OUT] returns true
139
+ * [get, the, hell, out] returns false
125
140
*
126
- * @param inputText
127
- * @throws IOException
141
+ * @return {@code true} iff at least one token, but not every token, is upper case according to {@code Utils.isUpper}
128
142
*/
129
- public TextProperties (String inputText ) throws IOException {
130
- this .inputText = inputText ;
131
- setWordsAndEmoticons ();
132
- setCapDiff (isAllCapDifferential ());
133
- }
134
-
135
- public static void main (String [] args ) throws IOException {
136
- String input = "The plot was good, but the characters are uncompelling and the dialog is not great. :( :(" ;
137
- TextProperties properties = new TextProperties (input );
138
- System .out .println (properties .getWordsOnly ());
139
- System .out .println (properties .getWordsAndEmoticons ());
140
- System .out .println (properties .isCapDiff ());
143
+ private boolean isAllCapDifferential () {
144
+ int countAllCaps = 0 ;
145
+ for (String token : wordsAndEmoticons ) {
146
+ if (Utils .isUpper (token )) {
147
+ countAllCaps ++;
148
+ }
149
+ }
150
+ final int capDifferential = wordsAndEmoticons .size () - countAllCaps ;
151
+ return (0 < capDifferential ) && (capDifferential < wordsAndEmoticons .size ());
141
152
}
142
153
}
0 commit comments