1 change: 1 addition & 0 deletions corpuslib/__init__.py
@@ -0,0 +1 @@
from test import Accuracy, Test
54 changes: 54 additions & 0 deletions corpuslib/test.py
@@ -0,0 +1,54 @@
import os

class Test:

    def __init__(self, root='', langid=None, accuracy=None):
        self.root = root      # corpus root: one subdirectory per language
        self.langid = langid  # object exposing classify(), e.g. the langid module
        self.a = accuracy     # Accuracy instance that tallies the results

    def visit(self, arg, dirname, names):
        path = dirname.split('/')

        if len(path) == 1:
            # At the corpus root: prune hidden entries (e.g. '.svn') so the
            # walk does not descend into them. Iterate in reverse down to
            # index 0 so deleting an entry cannot skip its neighbour.
            for i in range(len(names)-1, -1, -1):
                if names[i].startswith('.'):
                    del names[i]
        else:
            # Inside a language subdirectory: the directory name is the
            # expected label for every line of every file it contains.
            lang = path[1]
            for name in names:
                fd = open(dirname + '/' + name, 'r')
                for line in fd.readlines():
                    res = self.langid.classify(line)
                    if lang == res[0]:
                        self.a.update(correct=True)
                    else:
                        self.a.update(correct=False)
                fd.close()

    def start(self):
        os.path.walk(self.root, self.visit, '')

class Accuracy:

    def __init__(self):
        self.correct = 0
        self.incorrect = 0

    def update(self, correct=True):
        if correct:
            self.correct += 1
        else:
            self.incorrect += 1

    def evaluate(self):
        total_cases = self.correct + self.incorrect
        accuracy = self.correct * 100.0 / total_cases
        print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases)
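
A minimal sketch of how this harness would be driven, assuming a corpus laid out as one subdirectory per language under the root; the `corpus` path and the sample layout are illustrative, not part of this PR:

```python
import langid
from corpuslib import Test, Accuracy

langid.load_model()   # optional: pay the one-off model load cost up front

acc = Accuracy()
harness = Test(root='corpus', langid=langid, accuracy=acc)  # corpus/<lang>/<files>
harness.start()       # walk the corpus, classifying every line against its directory name
acc.evaluate()        # prints 'Accuracy = ... % (of N test cases)'
```

Note that `visit()` takes the expected label from the first path component below the root, so `root` should be a single relative directory name without slashes.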

Empty file added corpuslib/train.py
Empty file.
2 changes: 1 addition & 1 deletion langid/__init__.py
@@ -1 +1 @@
from langid import classify, rank, set_languages
from langid import classify, rank, set_languages, load_model
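
Exporting `load_model` from the package lets callers trigger the otherwise lazy model load explicitly (classify/rank only load it on first use); for example:

```python
import langid

langid.load_model()                       # force the one-off model load now
print langid.classify("This is a test.")  # e.g. ('en', ...) once the model is resident
```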
8 changes: 7 additions & 1 deletion langid/langid.py
@@ -38,6 +38,8 @@
FORCE_WSGIREF = False
NORM_PROBS = True # Normalize output probabilities.

SILENT = True

# NORM_PROBS can be set to False for a small speed increase. It does not
# affect the relative ordering of the predicted classes.

@@ -102,7 +104,7 @@ def rank(instance):
global identifier
if identifier is None:
load_model()

return identifier.rank(instance)

def cl_path(path):
@@ -156,6 +158,8 @@ class LanguageIdentifier(object):

@classmethod
def from_modelstring(cls, string, *args, **kwargs):
if not SILENT:
print 'Loading model from string'
model = loads(bz2.decompress(base64.b64decode(string)))
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model
nb_numfeats = len(nb_ptc) / len(nb_pc)
@@ -168,6 +172,8 @@ def from_modelstring(cls, path, *args, **kwargs):

@classmethod
def from_modelpath(cls, path, *args, **kwargs):
if not SILENT:
print 'Loading model from path'
with open(path) as f:
return cls.from_modelstring(f.read(), *args, **kwargs)

48 changes: 29 additions & 19 deletions langid/train/DFfeatureselect.py
@@ -38,7 +38,9 @@
# Can be overridden with command-line options
######
MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider
TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order
TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order

SILENT = True

import os, sys, argparse
import collections
@@ -88,8 +90,9 @@ def tally(bucketlist, jobs=None):
with MapPool(jobs) as f:
pass_sum_df_out = f(pass_sum_df, bucketlist)

for i, keycount in enumerate(pass_sum_df_out):
print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)
for i, keycount in enumerate(pass_sum_df_out):
if not SILENT:
print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)

# build the global term->df mapping
doc_count = {}
@@ -146,35 +149,42 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
else:
bucketlist_path = os.path.join(args.model, 'bucketlist')

# display paths
print "buckets path:", bucketlist_path
print "features output path:", feature_path
if args.tokens_per_order:
print "max ngram order:", args.max_order
print "tokens per order:", args.tokens_per_order
else:
print "tokens:", args.tokens
# display paths
if not SILENT:
print "buckets path:", bucketlist_path
print "features output path:", feature_path
if args.tokens_per_order:
if not SILENT:
print "max ngram order:", args.max_order
print "tokens per order:", args.tokens_per_order
else:
if not SILENT:
print "tokens:", args.tokens

with open(bucketlist_path) as f:
bucketlist = map(str.strip, f)

doc_count = tally(bucketlist, args.jobs)
print "unique features:", len(doc_count)
doc_count = tally(bucketlist, args.jobs)
if not SILENT:
print "unique features:", len(doc_count)
if args.doc_count:
# The constant true is used to indicate output to default location
doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count
write_weights(doc_count, doc_count_path)
print "wrote DF counts for all features to:", doc_count_path
write_weights(doc_count, doc_count_path)
if not SILENT:
print "wrote DF counts for all features to:", doc_count_path

if args.tokens_per_order:
# Choose a number of features for each length of token
feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
else:
# Choose a number of features overall
feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
print "selected features: ", len(feats)
feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
if not SILENT:
print "selected features: ", len(feats)

write_features(feats, feature_path)
print 'wrote features to "%s"' % feature_path
write_features(feats, feature_path)
if not SILENT:
print 'wrote features to "%s"' % feature_path
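
The `if not SILENT: print ...` guard now recurs throughout this and the following training scripts; a hypothetical helper (not part of this PR) that would centralise the pattern:

```python
SILENT = True

def log(*args):
    # Emit progress messages only when verbose output is enabled.
    if not SILENT:
        print ' '.join(str(a) for a in args)
```

Each `if not SILENT:` / `print` pair above would then collapse to a single `log(...)` call.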


42 changes: 26 additions & 16 deletions langid/train/IGweight.py
@@ -32,7 +32,9 @@
The views and conclusions contained in the software and documentation are those of the
authors and should not be interpreted as representing official policies, either expressed
or implied, of the copyright holder.
"""
"""

SILENT = True

import os, sys, argparse
import csv
@@ -89,15 +91,18 @@ def pass_IG(buckets):
@param buckets a list of buckets. Each bucket must be a directory that contains files
with the appropriate suffix. Each file must contain marshalled
(term, event_id, count) triplets.
"""
"""

global __features, __dist, __binarize, __suffix

# We first tally the per-event frequency of each
# term in our selected feature set.
term_freq = defaultdict(lambda: defaultdict(int))
term_index = defaultdict(Enumerator())

for bucket in buckets:

for bucket in buckets:
if not SILENT:
print 'bucket:', bucket
for path in os.listdir(bucket):
if path.endswith(__suffix):
for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)):
@@ -165,8 +170,9 @@ def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None):

for i, (t, w) in enumerate(pass_IG_out):
weights.append(w)
terms.extend(t)
print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))
terms.extend(t)
if not SILENT:
print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))

if binarize:
weights = numpy.hstack(weights).transpose()
@@ -229,22 +235,26 @@ def read_dist(path):
else:
weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else ''))

# display paths
print "model path:", args.model
print "buckets path:", bucketlist_paths
print "features path:", feature_path
print "weights path:", weights_path
print "index path:", index_path
print "suffix:", suffix

print "computing information gain"
# display paths
if not SILENT:
print "model path:", args.model
print "buckets path:", bucketlist_paths
print "features path:", feature_path
print "weights path:", weights_path
print "index path:", index_path
print "suffix:", suffix

if not SILENT:
print "computing information gain"
# Compile buckets together
bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths))

# Check that each bucketlist has the same number of buckets
assert len(set(map(len,bucketlist))) == 1, "incompatible bucketlists!"

dist = read_dist(index_path)
dist = read_dist(index_path)
if not SILENT:
print 'bucketlist:', bucketlist
ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs)

write_weights(ig, weights_path)
25 changes: 15 additions & 10 deletions langid/train/LDfeatureselect.py
@@ -37,7 +37,9 @@
# Default values
# Can be overridden with command-line options
######
FEATURES_PER_LANG = 300 # number of features to select for each language
FEATURES_PER_LANG = 300 # number of features to select for each language

SILENT = True

import os, sys, argparse
import csv
@@ -90,11 +92,12 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
domain_w_path = os.path.join(args.model, 'IGweights.domain')
feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats')

# display paths
print "model path:", args.model
print "lang weights path:", lang_w_path
print "domain weights path:", domain_w_path
print "feature output path:", feature_path
# display paths
if not SILENT:
print "model path:", args.model
print "lang weights path:", lang_w_path
print "domain weights path:", domain_w_path
print "feature output path:", feature_path

lang_w = read_weights(lang_w_path)
domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None
@@ -107,9 +110,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
writer.writerow(map(repr,features_per_lang[i]))


final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
print 'selected %d features' % len(final_feature_set)
final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
if not SILENT:
print 'selected %d features' % len(final_feature_set)

write_features(sorted(final_feature_set), feature_path)
print 'wrote features to "%s"' % feature_path
write_features(sorted(final_feature_set), feature_path)
if not SILENT:
print 'wrote features to "%s"' % feature_path

32 changes: 19 additions & 13 deletions langid/train/NBtrain.py
@@ -35,6 +35,8 @@
"""
MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once
NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation

SILENT = True

import base64, bz2, cPickle
import os, sys, argparse, csv
@@ -131,7 +133,7 @@ def pass_ptc(b_dir):
read_count = 0
for path in os.listdir(b_dir):
if path.endswith('.index'):
for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path),do_gzip=False):
terms[f_id][doc_id] = count
read_count += 1

@@ -189,16 +191,18 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

write_count = sum(pass_tokenize_out)
print "wrote a total of %d keys" % write_count
write_count = sum(pass_tokenize_out)
if not SILENT:
print "wrote a total of %d keys" % write_count

pass_ptc_params = (cm, num_instances)
with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
pass_ptc_out = f(pass_ptc, b_dirs)

reads, ids, prods = zip(*pass_ptc_out)
read_count = sum(reads)
print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)
read_count = sum(reads)
if not SILENT:
print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

prod = np.zeros((num_features, cm.shape[1]), dtype=int)
prod[np.concatenate(ids)] = np.vstack(prods)
@@ -249,12 +253,13 @@ def cleanup():
index_path = os.path.join(args.model, 'paths')
lang_path = os.path.join(args.model, 'lang_index')

# display paths
print "model path:", args.model
print "temp path:", temp_path
print "scanner path:", scanner_path
#print "index path:", index_path
print "output path:", output_path
# display paths
if not SILENT:
print "model path:", args.model
print "temp path:", temp_path
print "scanner path:", scanner_path
print "index path:", index_path
print "output path:", output_path

# read list of training files
with open(index_path) as f:
Expand All @@ -281,5 +286,6 @@ def cleanup():
model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
string = base64.b64encode(bz2.compress(cPickle.dumps(model)))
with open(output_path, 'w') as f:
f.write(string)
print "wrote model to %s (%d bytes)" % (output_path, len(string))
f.write(string)
if not SILENT:
print "wrote model to %s (%d bytes)" % (output_path, len(string))