Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions examples/tweeter/nlp/language_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
###START-CONF
##{
##"object_name": "language_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
## {
## "name": "tweet",
## "description": "",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "RAW"
## }
## ],
##"return": [
## {
## "name": "tweet",
## "description": "language filter",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "ENGLISH"
## }
##
## ] }
##END-CONF

# supports: DANISH|DUTCH|ENGLISH|FINNISH|FRENCH|GERMAN|HUNGARIAN|ITALIAN|NORWEGIAN|PORTUGUESE|RUSSIAN|SPANISH|SWEDISH|TURKISH

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

class language_filter(PmkSeed.Seed):
    """Pumpkin seed that lets only English tweets through.

    Per the CONF header of this file, the seed receives ``TweetString``
    packets in state ``RAW``; tweets whose text is classified as English
    are re-dispatched unchanged in state ``ENGLISH``. Everything else is
    dropped silently.
    """

    # Group 2 captures everything between a "W" followed by whitespace and
    # the last newline of the record (re.S lets "." span newlines).
    # NOTE(review): presumably "W" is the line prefix of the tweet text in
    # the raw record format -- confirm against the producer seed.
    _TWEET_TEXT = re.compile(r'W(\s+)(.*)(\n)', re.S)

    def __init__(self, context, poi=None):
        """Initialise the base seed and remember the working directory."""
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()

    def on_load(self):
        """Announce on stdout that the seed has been loaded."""
        print("Loading: " + self.__class__.__name__)

    def detect_language(self, text):
        """Guess the language of ``text`` from its stopword overlap.

        For every language in the NLTK stopwords corpus the score is the
        number of distinct lower-cased tokens of ``text`` that are stopwords
        of that language, divided by the size of that language's stopword
        list. The first language with the highest score wins.

        Returns the corpus file id of the best language (e.g. ``'english'``),
        or ``None`` when no stopword of any language occurs in ``text``.
        """
        # The token set does not depend on the language: build it once.
        words_set = set(word.lower() for word in wordpunct_tokenize(text))

        best_language = None
        best_ratio = 0.0
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            if not stopwords_set:
                # An empty stopword list cannot match and would divide by 0.
                continue
            common_elements = words_set.intersection(stopwords_set)
            ratio = float(len(common_elements)) / len(stopwords_set)
            # Strict ">" keeps the first language on ties and leaves the
            # result at None when nothing matched at all.
            if ratio > best_ratio:
                best_language, best_ratio = language, ratio

        return best_language

    def run(self, pkt, tweet):
        """Dispatch ``tweet`` as ``ENGLISH`` when its text is English.

        ``pkt`` is passed through to ``dispatch`` untouched. Tweets without
        a recognisable text part, or with a text of 10 characters or fewer,
        are dropped.
        """
        m = self._TWEET_TEXT.search(tweet)
        if not m:
            return
        tw = m.group(2)
        if len(tw) <= 10:
            return
        if self.detect_language(tw) == 'english':
            self.dispatch(pkt, tweet, 'ENGLISH')
64 changes: 64 additions & 0 deletions examples/tweeter/nlp/named_entity_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
###START-CONF
##{
##"object_name": "named_entity_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
## {
## "name": "tweet",
## "description": "",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "POSITIVE"
## }
## ],
##"return": [
## {
## "name": "tweet",
## "description": "named entity extractor",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "ENTITIES"
## }
##
## ] }
##END-CONF

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk

class named_entity_filter(PmkSeed.Seed):
    """Pumpkin seed that extracts named entities from a tweet.

    Per the CONF header of this file, the seed receives ``TweetString``
    packets in state ``POSITIVE`` and emits the comma-separated entity
    words in state ``ENTITIES``. Tweets without any entity are dropped.
    """

    def __init__(self, context, poi=None):
        """Initialise the base seed and remember the working directory."""
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()

    def on_load(self):
        """Announce on stdout that the seed has been loaded."""
        print("Loading: " + self.__class__.__name__)

    def extract_named_entities(self, text):
        """Return the words of ``text`` that NLTK chunks as named entities.

        The text is split into sentences, each sentence is tokenised and
        POS-tagged, and ``ne_chunk(..., binary=True)`` labels entity words
        with ``'NE'``. Every word inside an entity is returned as its own
        list element, in order of appearance.
        """
        entities = []
        for sentence in sent_tokenize(text):
            tagged_tokens = pos_tag(word_tokenize(sentence))
            chunk_tree = ne_chunk(tagged_tokens, binary=True)
            # Tree.pos() yields ((token, pos_tag), chunk_label) pairs.
            for leaf, label in chunk_tree.pos():
                if label == 'NE':
                    entities.append(leaf[0])
        return entities

    def run(self, pkt, tweet):
        """Dispatch the named entities found in ``tweet`` as ``ENTITIES``.

        ``pkt`` is passed through to ``dispatch`` untouched. Nothing is
        dispatched when the tweet text cannot be located or contains no
        named entity.
        """
        # Group 2 is the text between "W<whitespace>" and the last newline.
        match = re.search(r'W(\s+)(.*)(\n)', tweet, re.S)
        if match is None:
            return

        text = match.group(2)
        self.logger.info("named_entity_filter: " + text)

        found = self.extract_named_entities(text)
        if not found:
            return

        self.logger.info("named_entity_filter: |" + "| ".join(found))
        self.dispatch(pkt, ", ".join(found), 'ENTITIES')

129 changes: 129 additions & 0 deletions examples/tweeter/nlp/topic_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
###START-CONF
##{
##"object_name": "topic_filter",
##"object_poi": "qpwo-2345",
##"auto-load": true,
##"remoting" : true,
##"parameters": [
## {
## "name": "tweet",
## "description": "",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "ENGLISH"
## }
## ],
##"return": [
## {
## "name": "tweet",
## "description": "topic detector",
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "MOVIE"
## }
##
## ] }
##END-CONF

import re, os, time
import urllib2
from random import randint
from pumpkin import PmkSeed
from nltk.corpus import reuters, movie_reviews
from operator import itemgetter
import nltk, pickle

class topic_filter(PmkSeed.Seed):
    """Pumpkin seed that lets only movie-related tweets through.

    Per the CONF header of this file, the seed receives ``TweetString``
    packets in state ``ENGLISH``; tweets that ``TopicDetector`` classifies
    as being about movies are re-dispatched unchanged in state ``MOVIE``.
    """

    def __init__(self, context, poi=None):
        """Initialise the base seed and remember the working directory."""
        PmkSeed.Seed.__init__(self, context, poi)
        self.wd = self.context.getWorkingDir()

    def on_load(self):
        """Download the topic data and build the ``TopicDetector``.

        Blocks until the download has succeeded (see ``get_net_file``).
        """
        print("Loading: " + self.__class__.__name__)
        url = "https://www.dropbox.com/s/qn6o8r3liq5jxv4/topic_detection_data.pickle?dl=1"
        # os.path.join works whether or not the working directory ends with
        # a path separator.
        file_name = os.path.join(self.wd, "topic_detection_data.pickle")
        self.get_net_file(url, file_name)
        # SECURITY NOTE: TopicDetector unpickles this downloaded file, and
        # unpickling can execute arbitrary code. The URL must stay trusted.
        self.td = TopicDetector(file_name)

    def get_net_file(self, url, file_name):
        """Download ``url`` to ``file_name``, retrying until it succeeds.

        Any failure is logged together with its cause and the download is
        attempted again after a 5 second pause; there is no retry limit.
        """
        while True:
            try:
                self._download(url, file_name)
                return
            except Exception as e:
                # Deliberate best-effort boundary: keep retrying, but do not
                # hide what went wrong.
                self.logger.error("Error downloading, trying again.... (%s)" % e)
                time.sleep(5)

    def _download(self, url, file_name, block_sz=8192):
        """Stream ``url`` into ``file_name`` in chunks of ``block_sz`` bytes."""
        u = urllib2.urlopen(url)
        try:
            # Content-Length is optional (e.g. chunked responses); it is used
            # for logging only, so its absence must not fail the download.
            sizes = u.info().getheaders("Content-Length")
            file_size = sizes[0] if sizes else "unknown"
            self.logger.info("Downloading: %s Bytes: %s" % (file_name, file_size))

            # "with" closes the file even when a read fails half-way.
            with open(file_name, 'wb') as f:
                while True:
                    chunk = u.read(block_sz)
                    if not chunk:
                        break
                    f.write(chunk)
        finally:
            u.close()

    def run(self, pkt, tweet):
        """Dispatch ``tweet`` as ``MOVIE`` when its text is about movies.

        ``pkt`` is passed through to ``dispatch`` untouched. Tweets without
        a recognisable text part are dropped.
        """
        # Group 2 is the text between "W<whitespace>" and the last newline.
        m = re.search(r'W(\s+)(.*)(\n)', tweet, re.S)
        if not m:
            return
        tw = m.group(2)
        if self.td.is_topic('movies', tw):
            self.logger.info("topic_filter: topic found in " + tw)
            self.dispatch(pkt, tweet, "MOVIE")


class TopicDetector:
    """Keyword-based detector for the "movies" topic.

    The detector holds a list of up to 200 ``(word, ratio)`` pairs, where
    ``ratio`` is the word's relative frequency in the NLTK ``movie_reviews``
    corpus divided by its relative frequency in the ``reuters`` corpus. The
    list is loaded from a pickle file, or computed and cached there when the
    file is missing or empty.

    Attributes:
        vector: list of ``(word, ratio)`` tuples, highest ratio first.
        words:  list of the words of ``vector``, in the same order.
        topics: list of topic names ``is_topic`` supports.
    """

    def __init__(self, path_to_data=None):
        """Load (or build) the word vector.

        ``path_to_data`` is the pickle cache file; when falsy,
        ``/tmp/stats.pickle`` is used.
        """
        self._load_vector(path_to_data)
        # A real list (not a lazy map object) so it can be reused by every
        # is_topic() call on any Python version.
        self.words = [entry[0] for entry in self.vector]
        self.topics = ["movies"]

    def _load_vector(self, path_to_data):
        """Set ``self.vector`` from the cache file, rebuilding it if needed."""
        if not path_to_data:
            path_to_data = '/tmp/stats.pickle'
        self.vector = None
        try:
            with open(path_to_data, 'rb') as data_file:
                # SECURITY NOTE: pickle.load can execute arbitrary code; the
                # cache file must come from a trusted source.
                self.vector = pickle.load(data_file)
        except (IOError, EOFError):
            # Missing or empty cache: build the vector first and only then
            # open the file for writing, so that a failed build cannot leave
            # an empty file behind that breaks every later load.
            self.vector = self._build_vector()
            with open(path_to_data, 'wb') as data_file:
                pickle.dump(self.vector, data_file)

    def _build_vector(self):
        """Return the 200 words most specific to movie reviews vs. Reuters."""
        all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if len(w) > 3)
        all_words_r = nltk.FreqDist(w.lower() for w in reuters.words() if len(w) > 3)

        vector = []
        for word in all_words.keys():
            reuters_freq = all_words_r.freq(word)
            if not reuters_freq:
                # No Reuters occurrence means no defined ratio: skip the word.
                continue
            vector.append((word, all_words.freq(word) / reuters_freq))
        vector.sort(key=itemgetter(1), reverse=True)
        return vector[:200]

    def is_topic(self, topic, text):
        """Return True when ``text`` is about ``topic``.

        A text matches when more than one of its distinct lower-cased tokens
        appears in ``self.words``. Topics not listed in ``self.topics`` never
        match.
        """
        if topic not in self.topics:
            return False  # todo: more topics than movies

        words = set(word.lower() for word in nltk.wordpunct_tokenize(text))
        inter = words.intersection(self.words)
        return len(inter) > 1

8 changes: 4 additions & 4 deletions examples/tweeter/soa/sentiment_analyses.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "ENGLISH"
## "state" : "MOVIE"
## }
## ],
##"return": [
Expand All @@ -21,7 +21,7 @@
## "required": true,
## "type": "TweetString",
## "format": "",
## "state" : "POSITIVE|NEGATIVE"
## "state" : "POSITIVE"
## }
##
## ] }
Expand Down Expand Up @@ -100,5 +100,5 @@ def run(self, pkt, tweet):
tw = m.group(2)
if self.check(tw):
self.dispatch(pkt, tweet, "POSITIVE")
else:
self.dispatch(pkt, tweet, "NEGATIVE")
# else:
# self.dispatch(pkt, tweet, "NEGATIVE")