1 change: 1 addition & 0 deletions corpuslib/__init__.py
@@ -0,0 +1 @@
from test import Accuracy, Test
54 changes: 54 additions & 0 deletions corpuslib/test.py
@@ -0,0 +1,54 @@
import os

class Test:

    def __init__(self, root='', langid=None, accuracy=None):
        self.root = root      # corpus root: one subdirectory per language
        self.langid = langid  # object exposing classify(), e.g. the langid module
        self.a = accuracy     # Accuracy instance that tallies the results

    def visit(self, arg, dirname, names):
        path = dirname.split('/')

        if len(path) == 1:
            # At the corpus root: prune hidden entries (e.g. '.svn') so the
            # walk does not descend into them. Iterate in reverse down to
            # index 0 so deleting an entry cannot skip its neighbour.
            for i in range(len(names)-1, -1, -1):
                if names[i].startswith('.'):
                    del names[i]
        else:
            # Inside a language subdirectory: the directory name is the
            # expected label for every line of every file it contains.
            lang = path[1]
            for name in names:
                fd = open(dirname + '/' + name, 'r')
                for line in fd.readlines():
                    res = self.langid.classify(line)
                    if lang == res[0]:
                        self.a.update(correct=True)
                    else:
                        self.a.update(correct=False)
                fd.close()

    def start(self):
        os.path.walk(self.root, self.visit, '')

class Accuracy:

    def __init__(self):
        self.correct = 0
        self.incorrect = 0

    def update(self, correct=True):
        if correct:
            self.correct += 1
        else:
            self.incorrect += 1

    def evaluate(self):
        total_cases = self.correct + self.incorrect
        accuracy = self.correct * 100.0 / total_cases
        print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases)
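
A minimal sketch of how this harness would be driven, assuming a corpus laid out as one subdirectory per language under the root; the `corpus` path and the sample layout are illustrative, not part of this PR:

```python
import langid
from corpuslib import Test, Accuracy

langid.load_model()   # optional: pay the one-off model load cost up front

acc = Accuracy()
harness = Test(root='corpus', langid=langid, accuracy=acc)  # corpus/<lang>/<files>
harness.start()       # walk the corpus, classifying every line against its directory name
acc.evaluate()        # prints 'Accuracy = ... % (of N test cases)'
```

Note that `visit()` takes the expected label from the first path component below the root, so `root` should be a single relative directory name without slashes.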

Empty file added corpuslib/train.py
Empty file.
2 changes: 1 addition & 1 deletion langid/__init__.py
@@ -1 +1 @@
from langid import classify, rank, set_languages
from langid import classify, rank, set_languages, load_model
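
Exporting `load_model` from the package lets callers trigger the otherwise lazy model load explicitly (classify/rank only load it on first use); for example:

```python
import langid

langid.load_model()                       # force the one-off model load now
print langid.classify("This is a test.")  # e.g. ('en', ...) once the model is resident
```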
8 changes: 7 additions & 1 deletion langid/langid.py
@@ -38,6 +38,8 @@
FORCE_WSGIREF = False
NORM_PROBS = True # Normalize output probabilities.

SILENT = True

# NORM_PROBS can be set to False for a small speed increase. It does not
# affect the relative ordering of the predicted classes.

@@ -102,7 +104,7 @@ def rank(instance):
global identifier
if identifier is None:
load_model()

return identifier.rank(instance)

def cl_path(path):
@@ -156,6 +158,8 @@ class LanguageIdentifier(object):

@classmethod
def from_modelstring(cls, string, *args, **kwargs):
if not SILENT:
print 'Loading model from string'
model = loads(bz2.decompress(base64.b64decode(string)))
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model
nb_numfeats = len(nb_ptc) / len(nb_pc)
@@ -168,6 +172,8 @@ def from_modelstring(cls, path, *args, **kwargs):

@classmethod
def from_modelpath(cls, path, *args, **kwargs):
if not SILENT:
print 'Loading model from path'
with open(path) as f:
return cls.from_modelstring(f.read(), *args, **kwargs)

48 changes: 29 additions & 19 deletions langid/train/DFfeatureselect.py
@@ -38,7 +38,9 @@
# Can be overridden with command-line options
######
MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider
TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order
TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order

SILENT = True

import os, sys, argparse
import collections
@@ -88,8 +90,9 @@ def tally(bucketlist, jobs=None):
with MapPool(jobs) as f:
pass_sum_df_out = f(pass_sum_df, bucketlist)

for i, keycount in enumerate(pass_sum_df_out):
print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)
for i, keycount in enumerate(pass_sum_df_out):
if not SILENT:
print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)

# build the global term->df mapping
doc_count = {}
@@ -146,35 +149,42 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
else:
bucketlist_path = os.path.join(args.model, 'bucketlist')

# display paths
print "buckets path:", bucketlist_path
print "features output path:", feature_path
if args.tokens_per_order:
print "max ngram order:", args.max_order
print "tokens per order:", args.tokens_per_order
else:
print "tokens:", args.tokens
# display paths
if not SILENT:
print "buckets path:", bucketlist_path
print "features output path:", feature_path
if args.tokens_per_order:
if not SILENT:
print "max ngram order:", args.max_order
print "tokens per order:", args.tokens_per_order
else:
if not SILENT:
print "tokens:", args.tokens

with open(bucketlist_path) as f:
bucketlist = map(str.strip, f)

doc_count = tally(bucketlist, args.jobs)
print "unique features:", len(doc_count)
doc_count = tally(bucketlist, args.jobs)
if not SILENT:
print "unique features:", len(doc_count)
if args.doc_count:
# The constant true is used to indicate output to default location
doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count
write_weights(doc_count, doc_count_path)
print "wrote DF counts for all features to:", doc_count_path
write_weights(doc_count, doc_count_path)
if not SILENT:
print "wrote DF counts for all features to:", doc_count_path

if args.tokens_per_order:
# Choose a number of features for each length of token
feats = ngram_select(doc_count, args.max_order, args.tokens_per_order)
else:
# Choose a number of features overall
feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
print "selected features: ", len(feats)
feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] )
if not SILENT:
print "selected features: ", len(feats)

write_features(feats, feature_path)
print 'wrote features to "%s"' % feature_path
write_features(feats, feature_path)
if not SILENT:
print 'wrote features to "%s"' % feature_path
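
The `if not SILENT: print ...` guard now recurs throughout this and the following training scripts; a hypothetical helper (not part of this PR) that would centralise the pattern:

```python
SILENT = True

def log(*args):
    # Emit progress messages only when verbose output is enabled.
    if not SILENT:
        print ' '.join(str(a) for a in args)
```

Each `if not SILENT:` / `print` pair above would then collapse to a single `log(...)` call.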


42 changes: 26 additions & 16 deletions langid/train/IGweight.py
@@ -32,7 +32,9 @@
The views and conclusions contained in the software and documentation are those of the
authors and should not be interpreted as representing official policies, either expressed
or implied, of the copyright holder.
"""
"""

SILENT = True

import os, sys, argparse
import csv
@@ -89,15 +91,18 @@ def pass_IG(buckets):
@param buckets a list of buckets. Each bucket must be a directory that contains files
with the appropriate suffix. Each file must contain marshalled
(term, event_id, count) triplets.
"""
"""

global __features, __dist, __binarize, __suffix

# We first tally the per-event frequency of each
# term in our selected feature set.
term_freq = defaultdict(lambda: defaultdict(int))
term_index = defaultdict(Enumerator())

for bucket in buckets:

for bucket in buckets:
if not SILENT:
print 'bucket:', bucket
for path in os.listdir(bucket):
if path.endswith(__suffix):
for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)):
@@ -165,8 +170,9 @@ def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None):

for i, (t, w) in enumerate(pass_IG_out):
weights.append(w)
terms.extend(t)
print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))
terms.extend(t)
if not SILENT:
print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))

if binarize:
weights = numpy.hstack(weights).transpose()
@@ -229,22 +235,26 @@ def read_dist(path):
else:
weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else ''))

# display paths
print "model path:", args.model
print "buckets path:", bucketlist_paths
print "features path:", feature_path
print "weights path:", weights_path
print "index path:", index_path
print "suffix:", suffix

print "computing information gain"
# display paths
if not SILENT:
print "model path:", args.model
print "buckets path:", bucketlist_paths
print "features path:", feature_path
print "weights path:", weights_path
print "index path:", index_path
print "suffix:", suffix

if not SILENT:
print "computing information gain"
# Compile buckets together
bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths))

# Check that each bucketlist has the same number of buckets
assert len(set(map(len,bucketlist))) == 1, "incompatible bucketlists!"

dist = read_dist(index_path)
dist = read_dist(index_path)
if not SILENT:
print 'bucketlist:', bucketlist
ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs)

write_weights(ig, weights_path)
25 changes: 15 additions & 10 deletions langid/train/LDfeatureselect.py
@@ -37,7 +37,9 @@
# Default values
# Can be overridden with command-line options
######
FEATURES_PER_LANG = 300 # number of features to select for each language
FEATURES_PER_LANG = 300 # number of features to select for each language

SILENT = True

import os, sys, argparse
import csv
@@ -90,11 +92,12 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
domain_w_path = os.path.join(args.model, 'IGweights.domain')
feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats')

# display paths
print "model path:", args.model
print "lang weights path:", lang_w_path
print "domain weights path:", domain_w_path
print "feature output path:", feature_path
# display paths
if not SILENT:
print "model path:", args.model
print "lang weights path:", lang_w_path
print "domain weights path:", domain_w_path
print "feature output path:", feature_path

lang_w = read_weights(lang_w_path)
domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None
@@ -107,9 +110,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
writer.writerow(map(repr,features_per_lang[i]))


final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
print 'selected %d features' % len(final_feature_set)
final_feature_set = reduce(set.union, map(set, features_per_lang.values()))
if not SILENT:
print 'selected %d features' % len(final_feature_set)

write_features(sorted(final_feature_set), feature_path)
print 'wrote features to "%s"' % feature_path
write_features(sorted(final_feature_set), feature_path)
if not SILENT:
print 'wrote features to "%s"' % feature_path

32 changes: 19 additions & 13 deletions langid/train/NBtrain.py
@@ -35,6 +35,8 @@
"""
MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once
NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation

SILENT = True

import base64, bz2, cPickle
import os, sys, argparse, csv
@@ -131,7 +133,7 @@ def pass_ptc(b_dir):
read_count = 0
for path in os.listdir(b_dir):
if path.endswith('.index'):
for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path),do_gzip=False):
terms[f_id][doc_id] = count
read_count += 1

@@ -189,16 +191,18 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

write_count = sum(pass_tokenize_out)
print "wrote a total of %d keys" % write_count
write_count = sum(pass_tokenize_out)
if not SILENT:
print "wrote a total of %d keys" % write_count

pass_ptc_params = (cm, num_instances)
with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
pass_ptc_out = f(pass_ptc, b_dirs)

reads, ids, prods = zip(*pass_ptc_out)
read_count = sum(reads)
print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)
read_count = sum(reads)
if not SILENT:
print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

prod = np.zeros((num_features, cm.shape[1]), dtype=int)
prod[np.concatenate(ids)] = np.vstack(prods)
@@ -249,12 +253,13 @@ def cleanup():
index_path = os.path.join(args.model, 'paths')
lang_path = os.path.join(args.model, 'lang_index')

# display paths
print "model path:", args.model
print "temp path:", temp_path
print "scanner path:", scanner_path
#print "index path:", index_path
print "output path:", output_path
# display paths
if not SILENT:
print "model path:", args.model
print "temp path:", temp_path
print "scanner path:", scanner_path
print "index path:", index_path
print "output path:", output_path

# read list of training files
with open(index_path) as f:
Expand All @@ -281,5 +286,6 @@ def cleanup():
model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
string = base64.b64encode(bz2.compress(cPickle.dumps(model)))
with open(output_path, 'w') as f:
f.write(string)
print "wrote model to %s (%d bytes)" % (output_path, len(string))
f.write(string)
if not SILENT:
print "wrote model to %s (%d bytes)" % (output_path, len(string))