# diff --git a/examples/tweeter/nlp/language_filter.py b/examples/tweeter/nlp/language_filter.py
# new file mode 100644
# index 0000000..455c9a9
# --- /dev/null
# +++ b/examples/tweeter/nlp/language_filter.py
# @@ -0,0 +1,71 @@
###START-CONF
##{
##"object_name": "language_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
##              {
##                  "name": "tweet",
##                  "description": "",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "RAW"
##              }
##          ],
##"return": [
##              {
##                  "name": "tweet",
##                  "description": "topic detector",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "ENGLISH"
##              }
##
##          ] }
##END-CONF

# supports: DANISH|DUTCH|ENGLISH|FINNISH|FRENCH|GERMAN|HUNGARIAN|ITALIAN|NORWEGIAN|PORTUGUESE|RUSSIAN|SPANISH|SWEDISH|TURKISH

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords


class language_filter(PmkSeed.Seed):
    """Pass through only the tweets whose text is detected as English.

    Per the CONF header above, the seed consumes ``TweetString`` data in
    state ``RAW`` and re-dispatches the unchanged tweet in state ``ENGLISH``.
    Tweets in any other language, or too short to judge, are dropped.
    """

    def __init__(self, context, poi=None):
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()
        # Filled lazily by _stopword_sets() so the corpus is read only once.
        self._stopword_cache = None

    def on_load(self):
        """Announce on stdout that the seed class has been loaded."""
        # Single-argument call form prints the same text as the old
        # ``print`` statement under Python 2 and is also valid Python 3.
        print("Loading: " + self.__class__.__name__)

    def _stopword_sets(self):
        """Return ``[(language, frozenset_of_stopwords), ...]``, built once.

        The order follows ``stopwords.fileids()`` because detect_language()
        resolves ties in favour of the language listed first.
        """
        # getattr() keeps this usable on instances created without __init__.
        cache = getattr(self, "_stopword_cache", None)
        if cache is None:
            cache = [(language, frozenset(stopwords.words(language)))
                     for language in stopwords.fileids()]
            self._stopword_cache = cache
        return cache

    def detect_language(self, text):
        """Guess the language of ``text`` from its stop words.

        Each language is scored by the fraction of its NLTK stop-word list
        that occurs in the lower-cased, tokenized text.

        Returns the NLTK ``stopwords`` file id (e.g. ``'english'``) with the
        highest score; ties go to the language listed first.  Returns
        ``None`` when no usable stop-word list is available.
        """
        # Loop-invariant: tokenize and build the word set once.
        words_set = set(word.lower() for word in wordpunct_tokenize(text))

        best_language, best_ratio = None, -1
        for language, stopwords_set in self._stopword_sets():
            if not stopwords_set:
                # An empty list cannot be scored (would divide by zero).
                continue
            common = words_set.intersection(stopwords_set)
            ratio = float(len(common)) / len(stopwords_set)
            if ratio > best_ratio:
                best_language, best_ratio = language, ratio

        return best_language

    def run(self, pkt, tweet):
        """Dispatch ``tweet`` in state ``ENGLISH`` if its text is English.

        The text is whatever lies between the first ``W`` that is followed
        by whitespace and the last newline of ``tweet``.  Records without
        such a section, or with text of 10 characters or fewer, are ignored.
        """
        # Raw string: same pattern as before, without relying on Python
        # passing the unknown escape ``\s`` through unchanged.
        m = re.search(r'W(\s+)(.*)(\n)', tweet, re.S)
        if not m:
            return

        tw = m.group(2)
        if len(tw) <= 10:
            # Too little text for the stop-word heuristic to be meaningful.
            return

        if self.detect_language(tw) == 'english':
            self.dispatch(pkt, tweet, 'ENGLISH')

# diff --git a/examples/tweeter/nlp/named_entity_filter.py
# b/examples/tweeter/nlp/named_entity_filter.py
# new file mode 100644
# index 0000000..6a08296
# --- /dev/null
# +++ b/examples/tweeter/nlp/named_entity_filter.py
# @@ -0,0 +1,64 @@
###START-CONF
##{
##"object_name": "named_entity_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
##              {
##                  "name": "tweet",
##                  "description": "",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "POSITIVE"
##              }
##          ],
##"return": [
##              {
##                  "name": "tweet",
##                  "description": "named entity extractor",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "ENTITIES"
##              }
##
##          ] }
##END-CONF

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk


class named_entity_filter(PmkSeed.Seed):
    """Extract named entities from tweets.

    Per the CONF header above, the seed consumes ``TweetString`` data in
    state ``POSITIVE`` and emits data in state ``ENTITIES``.  The emitted
    payload is the comma-separated entity tokens, not the original tweet.
    """

    def __init__(self, context, poi=None):
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()

    def on_load(self):
        """Announce on stdout that the seed class has been loaded."""
        print("Loading: " + self.__class__.__name__)

    def extract_named_entities(self, text):
        """Return the tokens of ``text`` that NLTK chunks as named entities.

        Every sentence is word-tokenized, POS-tagged and chunked with
        ``ne_chunk(..., binary=True)``; each token whose chunk label is
        ``'NE'`` is collected in order of appearance.
        """
        entities = []
        for sentence in sent_tokenize(text):
            tagged_tokens = pos_tag(word_tokenize(sentence))
            chunk_tree = ne_chunk(tagged_tokens, binary=True)
            for leaf, label in chunk_tree.pos():
                if label == 'NE':
                    # leaf is indexed for its first element, as the
                    # original did; presumably (token, pos_tag) - confirm.
                    entities.append(leaf[0])
        return entities

    def run(self, pkt, tweet):
        """Dispatch the entities found in ``tweet`` in state ``ENTITIES``.

        The text is whatever lies between the first ``W`` that is followed
        by whitespace and the last newline of ``tweet``.  Nothing is
        dispatched when that section is missing or holds no entity.
        """
        match = re.search('W(\s+)(.*)(\n)', tweet, re.S)
        if not match:
            return

        text = match.group(2)
        self.logger.info("named_entity_filter: " + text)

        found = self.extract_named_entities(text)
        if not found:
            return

        self.logger.info("named_entity_filter: |" + "| ".join(found))
        self.dispatch(pkt, ", ".join(found), 'ENTITIES')


# diff --git a/examples/tweeter/nlp/topic_filter.py b/examples/tweeter/nlp/topic_filter.py
# new file mode 100644
# index 0000000..b6941f9
# --- /dev/null
# +++ b/examples/tweeter/nlp/topic_filter.py
# @@ -0,0 +1,129 @@
###START-CONF
##{
##"object_name": "topic_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
##              {
##                  "name": "tweet",
##                  "description": "",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "ENGLISH"
##              }
##          ],
##"return": [
##              {
##                  "name": "tweet",
##                  "description": "topic detector",
##                  "required": true,
##                  "type": "TweetString",
##                  "format": "",
##                  "state" : "MOVIE"
##              }
##
##          ] }
##END-CONF

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk.corpus import reuters, movie_reviews
from operator import itemgetter
import nltk, pickle


class topic_filter(PmkSeed.Seed):
    """Pass through only the tweets that look like they are about movies.

    Per the CONF header above, the seed consumes ``TweetString`` data in
    state ``ENGLISH`` and re-dispatches the unchanged tweet in state
    ``MOVIE``.
    """

    def __init__(self, context, poi=None):
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()

    def on_load(self):
        """Download the keyword data and build the ``TopicDetector``."""
        print("Loading: " + self.__class__.__name__)
        url = "https://www.dropbox.com/s/qn6o8r3liq5jxv4/topic_detection_data.pickle?dl=1"
        # os.path.join gives the same path as plain concatenation when the
        # working dir ends with a separator, and a correct one when it
        # does not.
        file_name = os.path.join(self.wd, "topic_detection_data.pickle")
        self.get_net_file(url, file_name)
        self.td = TopicDetector(file_name)

    def get_net_file(self, url, file_name):
        """Download ``url`` into ``file_name``, retrying until it succeeds.

        A failed attempt is logged with its cause and retried after five
        seconds; there is no retry limit, so this blocks for as long as the
        download keeps failing.  A body shorter or longer than the
        advertised ``Content-Length`` counts as a failed attempt.
        """
        block_sz = 8192
        while True:
            response = None
            try:
                response = urllib2.urlopen(url)

                lengths = response.info().getheaders("Content-Length")
                try:
                    file_size = int(lengths[0])
                except (IndexError, ValueError):
                    # Header missing or malformed: size unknown, not fatal.
                    file_size = None
                self.logger.info("Downloading: %s Bytes: %s" % (file_name, file_size))

                received = 0
                # 'wb' truncates, so a retry overwrites any partial file.
                with open(file_name, 'wb') as out:
                    while True:
                        chunk = response.read(block_sz)
                        if not chunk:
                            break
                        received += len(chunk)
                        out.write(chunk)

                if file_size is not None and received != file_size:
                    raise IOError("incomplete download: %d of %d bytes"
                                  % (received, file_size))
                return
            except Exception as e:
                # Deliberate best effort: any failure means "try again",
                # but the cause is logged rather than swallowed.
                self.logger.error("Error downloading, trying again.... (%s)" % (e,))
            finally:
                if response is not None:
                    response.close()
            time.sleep(5)

    def run(self, pkt, tweet):
        """Dispatch ``tweet`` in state ``MOVIE`` if it is about movies.

        The text is whatever lies between the first ``W`` that is followed
        by whitespace and the last newline of ``tweet``; records without
        such a section are ignored.
        """
        # Raw string: same pattern as before, without relying on Python
        # passing the unknown escape ``\s`` through unchanged.
        m = re.search(r'W(\s+)(.*)(\n)', tweet, re.S)
        if not m:
            return

        tw = m.group(2)
        if self.td.is_topic('movies', tw):
            self.logger.info("topic_filter: topic found in " + tw)
            self.dispatch(pkt, tweet, "MOVIE")


class TopicDetector:
    """Keyword-overlap topic detector; only ``"movies"`` is supported.

    ``vector`` holds up to 200 ``(word, ratio)`` entries sorted by
    descending ratio, and ``words`` holds just their words.
    """

    def __init__(self, path_to_data=None):
        self._load_vector(path_to_data)
        # A real list (not a lazy ``map``) so it can be reused on each call.
        self.words = [entry[0] for entry in self.vector]
        self.topics = ["movies"]

    def _load_vector(self, path_to_data):
        """Load ``self.vector`` from the pickle, or build and cache it.

        ``path_to_data`` defaults to ``/tmp/stats.pickle``.  When the file
        cannot be read (``IOError``) the vector is computed from the NLTK
        corpora and then written to that path.
        """
        if not path_to_data:
            path_to_data = '/tmp/stats.pickle'
        self.vector = None
        try:
            with open(path_to_data, 'rb') as data_file:
                # NOTE(security): unpickling can execute arbitrary code and
                # this file may come from the network (see
                # topic_filter.on_load) - only use trusted sources.
                self.vector = pickle.load(data_file)
        except IOError:
            # Build first, write second: a failed build then leaves no
            # empty cache file to break the next load.
            self.vector = self._build_vector()
            with open(path_to_data, 'wb') as data_file:
                pickle.dump(self.vector, data_file)

    def _build_vector(self):
        """Return the 200 words most over-represented in movie reviews."""
        all_words = nltk.FreqDist(
            w.lower() for w in movie_reviews.words() if len(w) > 3)
        all_words_r = nltk.FreqDist(
            w.lower() for w in reuters.words() if len(w) > 3)

        vector = []
        for word in all_words.keys():
            reference = all_words_r.freq(word)
            if not reference:
                # Absent from the Reuters corpus: no ratio can be computed.
                # The original had a bare ``next`` here (a no-op), which
                # stored such words with ratio 0 at the tail of the ranking.
                continue
            vector.append((word, all_words.freq(word) / reference))

        vector.sort(key=itemgetter(1), reverse=True)
        return vector[:200]

    def is_topic(self, topic, text):
        """Return True if ``text`` shares more than one keyword with ``topic``.

        Unsupported topics always yield ``False``.
        """
        if topic not in self.topics:
            return False  # todo: more topics than movies

        words = set(word.lower() for word in nltk.wordpunct_tokenize(text))
        inter = words.intersection(self.words)
        return len(inter) > 1


# diff --git a/examples/tweeter/soa/sentiment_analyses.py b/examples/tweeter/soa/sentiment_analyses.py
# index 9ceb311..9a3a312 100644
# --- a/examples/tweeter/soa/sentiment_analyses.py
# +++ b/examples/tweeter/soa/sentiment_analyses.py
# @@ -11,7 +11,7 @@
#  ## "required": true,
#  ## "type": "TweetString",
#  ## "format": "",
# -## "state" : "ENGLISH"
# +## "state" : "MOVIE"
#  ## }
#  ## ],
##"return": [ @@ -21,7 +21,7 @@ ## "required": true, ## "type": "TweetString", ## "format": "", -## "state" : "POSITIVE|NEGATIVE" +## "state" : "POSITIVE" ## } ## ## ] } @@ -100,5 +100,5 @@ def run(self, pkt, tweet): tw = m.group(2) if self.check(tw): self.dispatch(pkt, tweet, "POSITIVE") - else: - self.dispatch(pkt, tweet, "NEGATIVE") + # else: + # self.dispatch(pkt, tweet, "NEGATIVE")