From a7e1e9c53ec028bc6ace80ee6e4a90bfce1a3282 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Wed, 5 Mar 2014 02:57:23 +0200 Subject: [PATCH 1/5] Fixing gzip problem in NBtrain.py, by doing some changes in it and in common.py as well --- langid/train/NBtrain.py | 2 +- langid/train/common.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/langid/train/NBtrain.py b/langid/train/NBtrain.py index 75b6f6e6..f24f688f 100644 --- a/langid/train/NBtrain.py +++ b/langid/train/NBtrain.py @@ -131,7 +131,7 @@ def pass_ptc(b_dir): read_count = 0 for path in os.listdir(b_dir): if path.endswith('.index'): - for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)): + for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path),do_gzip=False): terms[f_id][doc_id] = count read_count += 1 diff --git a/langid/train/common.py b/langid/train/common.py index 8a31b656..c1880f74 100644 --- a/langid/train/common.py +++ b/langid/train/common.py @@ -32,18 +32,28 @@ def chunk(seq, chunksize): if not chunk: break yield chunk -def unmarshal_iter(path): +def unmarshal_iter(path, do_gzip=True): """ Open a given path and yield an iterator over items unmarshalled from it. """ - with gzip.open(path, 'rb') as f, tempfile.TemporaryFile() as t: - t.write(f.read()) - t.seek(0) - while True: - try: - yield marshal.load(t) - except EOFError: - break + if do_gzip: + with gzip.open(path, 'rb') as f, tempfile.TemporaryFile() as t: + t.write(f.read()) + t.seek(0) + while True: + try: + yield marshal.load(t) + except EOFError: + break + else: + with open(path, 'rb') as f, tempfile.TemporaryFile() as t: + t.write(f.read()) + t.seek(0) + while True: + try: + yield marshal.load(t) + except EOFError: + break import os, errno def makedir(path): From e7924010b05141f3667e45d67d5137389c73c8ad Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Wed, 5 Mar 2014 17:40:17 +0200 Subject: [PATCH 2/5] Adding the ability to skip hidden files starting with dot in their names --- langid/train/index.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/langid/train/index.py b/langid/train/index.py index 138e84ab..0b7cd7fb 100644 --- a/langid/train/index.py +++ b/langid/train/index.py @@ -74,10 +74,11 @@ class CorpusIndexer(object): """ Class to index the contents of a corpus """ - def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None): + def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None, ignore_hidden_files=True): self.root = root self.min_domain = min_domain - self.proportion = proportion + self.proportion = proportion + self.ignore_hidden_files = ignore_hidden_files if langs is None: self.lang_index = defaultdict(Enumerator()) @@ -102,8 +103,10 @@ def index(self, root): # root supplied was the root of a directory structure candidates = [] for dirpath, dirnames, filenames in os.walk(root, followlinks=True): - for docname in filenames: - candidates.append((dirpath, docname)) + for docname in filenames: + # Ignore hidden files starting with dot if told to do so + if self.ignore_hidden_files and not docname.startswith('.'): + candidates.append((dirpath, docname)) else: # root supplied was a file, interpet as list of paths candidates = [os.path.split(str.strip(l)) for l in open(root)] From dd191b4d7edd0c505ec6f1195911a4198bb059a1 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Thu, 6 Mar 2014 14:23:18 +0200 Subject: [PATCH 3/5] Adding mytrain.py to do all training at once, also edited 
other training components to make them silent, no print outs --- langid/train/DFfeatureselect.py | 48 ++++++++++++++--------- langid/train/IGweight.py | 42 ++++++++++++-------- langid/train/LDfeatureselect.py | 25 +++++++----- langid/train/NBtrain.py | 30 ++++++++------ langid/train/index.py | 30 ++++++++------ langid/train/mytrain.py | 47 ++++++++++++++++++++++ langid/train/scanner.py | 21 ++++++---- langid/train/tokenize.py | 69 ++++++++++++++++++++------------- langid/train/train.py | 1 + 9 files changed, 208 insertions(+), 105 deletions(-) create mode 100644 langid/train/mytrain.py diff --git a/langid/train/DFfeatureselect.py b/langid/train/DFfeatureselect.py index ea69c4d0..a5007568 100644 --- a/langid/train/DFfeatureselect.py +++ b/langid/train/DFfeatureselect.py @@ -38,7 +38,9 @@ # Can be overriden with command-line options ###### MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider -TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order +TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order + +SILENT = True import os, sys, argparse import collections @@ -88,8 +90,9 @@ def tally(bucketlist, jobs=None): with MapPool(jobs) as f: pass_sum_df_out = f(pass_sum_df, bucketlist) - for i, keycount in enumerate(pass_sum_df_out): - print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount) + for i, keycount in enumerate(pass_sum_df_out): + if not SILENT: + print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount) # build the global term->df mapping doc_count = {} @@ -146,35 +149,42 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P else: bucketlist_path = os.path.join(args.model, 'bucketlist') - # display paths - print "buckets path:", bucketlist_path - print "features output path:", feature_path - if args.tokens_per_order: - print "max ngram order:", args.max_order - print "tokens per order:", args.tokens_per_order - else: - print "tokens:", args.tokens + # display paths + if not SILENT: + print "buckets path:", bucketlist_path + print "features output path:", feature_path + if args.tokens_per_order: + if not SILENT: + print "max ngram order:", args.max_order + print "tokens per order:", args.tokens_per_order + else: + if not SILENT: + print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) - doc_count = tally(bucketlist, args.jobs) - print "unique features:", len(doc_count) + doc_count = tally(bucketlist, args.jobs) + if not SILENT: + print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count - write_weights(doc_count, doc_count_path) - print "wrote DF counts for all features to:", doc_count_path + write_weights(doc_count, doc_count_path) + if not SILENT: + print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall - feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) - print "selected features: ", len(feats) + feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) + if not SILENT: + print "selected features: ", len(feats) - write_features(feats, feature_path) - print 'wrote features to "%s"' % feature_path + write_features(feats, 
feature_path) + if not SILENT: + print 'wrote features to "%s"' % feature_path diff --git a/langid/train/IGweight.py b/langid/train/IGweight.py index 4f59936d..5a11658e 100644 --- a/langid/train/IGweight.py +++ b/langid/train/IGweight.py @@ -32,7 +32,9 @@ The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the copyright holder. -""" +""" + +SILENT = True import os, sys, argparse import csv @@ -89,15 +91,18 @@ def pass_IG(buckets): @param buckets a list of buckets. Each bucket must be a directory that contains files with the appropriate suffix. Each file must contain marshalled (term, event_id, count) triplets. - """ + """ + global __features, __dist, __binarize, __suffix # We first tally the per-event frequency of each # term in our selected feature set. term_freq = defaultdict(lambda: defaultdict(int)) term_index = defaultdict(Enumerator()) - - for bucket in buckets: + + for bucket in buckets: + if not SILENT: + print 'bucket:', bucket for path in os.listdir(bucket): if path.endswith(__suffix): for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)): @@ -165,8 +170,9 @@ def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None): for i, (t, w) in enumerate(pass_IG_out): weights.append(w) - terms.extend(t) - print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t)) + terms.extend(t) + if not SILENT: + print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t)) if binarize: weights = numpy.hstack(weights).transpose() @@ -229,22 +235,26 @@ def read_dist(path): else: weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) - # display paths - print "model path:", args.model - print "buckets path:", bucketlist_paths - print "features path:", feature_path - print "weights path:", weights_path - print "index path:", index_path - print "suffix:", suffix - - print "computing information gain" + # display paths + if not SILENT: + print "model path:", args.model + print "buckets path:", bucketlist_paths + print "features path:", feature_path + print "weights path:", weights_path + print "index path:", index_path + print "suffix:", suffix + + if not SILENT: + print "computing information gain" # Compile buckets together bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths)) # Check that each bucketlist has the same number of buckets assert len(set(map(len,bucketlist))) == 1, "incompatible bucketlists!" 
- dist = read_dist(index_path) + dist = read_dist(index_path) + if not SILENT: + print 'bucketlist:', bucketlist ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) write_weights(ig, weights_path) diff --git a/langid/train/LDfeatureselect.py b/langid/train/LDfeatureselect.py index d8c11ee4..4c5837e0 100644 --- a/langid/train/LDfeatureselect.py +++ b/langid/train/LDfeatureselect.py @@ -37,7 +37,9 @@ # Default values # Can be overriden with command-line options ###### -FEATURES_PER_LANG = 300 # number of features to select for each language +FEATURES_PER_LANG = 300 # number of features to select for each language + +SILENT = True import os, sys, argparse import csv @@ -90,11 +92,12 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): domain_w_path = os.path.join(args.model, 'IGweights.domain') feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') - # display paths - print "model path:", args.model - print "lang weights path:", lang_w_path - print "domain weights path:", domain_w_path - print "feature output path:", feature_path + # display paths + if not SILENT: + print "model path:", args.model + print "lang weights path:", lang_w_path + print "domain weights path:", domain_w_path + print "feature output path:", feature_path lang_w = read_weights(lang_w_path) domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None @@ -107,9 +110,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): writer.writerow(map(repr,features_per_lang[i])) - final_feature_set = reduce(set.union, map(set, features_per_lang.values())) - print 'selected %d features' % len(final_feature_set) + final_feature_set = reduce(set.union, map(set, features_per_lang.values())) + if not SILENT: + print 'selected %d features' % len(final_feature_set) - write_features(sorted(final_feature_set), feature_path) - print 'wrote features to "%s"' % feature_path + write_features(sorted(final_feature_set), feature_path) + if not SILENT: + print 'wrote features to "%s"' % feature_path diff --git a/langid/train/NBtrain.py b/langid/train/NBtrain.py index f24f688f..541cf421 100644 --- a/langid/train/NBtrain.py +++ b/langid/train/NBtrain.py @@ -35,6 +35,8 @@ """ MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation + +SILENT = True import base64, bz2, cPickle import os, sys, argparse, csv @@ -189,16 +191,18 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args): with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f: pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg) - write_count = sum(pass_tokenize_out) - print "wrote a total of %d keys" % write_count + write_count = sum(pass_tokenize_out) + if not SILENT: + print "wrote a total of %d keys" % write_count pass_ptc_params = (cm, num_instances) with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f: pass_ptc_out = f(pass_ptc, b_dirs) reads, ids, prods = zip(*pass_ptc_out) - read_count = sum(reads) - print "read a total of %d keys (%d short)" % (read_count, write_count - read_count) + read_count = sum(reads) + if not SILENT: + print "read a total of %d keys (%d short)" % (read_count, write_count - read_count) prod = np.zeros((num_features, cm.shape[1]), dtype=int) prod[np.concatenate(ids)] = np.vstack(prods) @@ -249,12 +253,13 @@ def cleanup(): index_path = os.path.join(args.model, 'paths') lang_path = os.path.join(args.model, 'lang_index') - 
# display paths - print "model path:", args.model - print "temp path:", temp_path - print "scanner path:", scanner_path - #print "index path:", index_path - print "output path:", output_path + # display paths + if not SILENT: + print "model path:", args.model + print "temp path:", temp_path + print "scanner path:", scanner_path + print "index path:", index_path + print "output path:", output_path # read list of training files with open(index_path) as f: @@ -281,5 +286,6 @@ def cleanup(): model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output string = base64.b64encode(bz2.compress(cPickle.dumps(model))) with open(output_path, 'w') as f: - f.write(string) - print "wrote model to %s (%d bytes)" % (output_path, len(string)) + f.write(string) + if not SILENT: + print "wrote model to %s (%d bytes)" % (output_path, len(string)) diff --git a/langid/train/index.py b/langid/train/index.py index 0b7cd7fb..563f6a56 100644 --- a/langid/train/index.py +++ b/langid/train/index.py @@ -59,7 +59,9 @@ # Can be overriden with command-line options ###### TRAIN_PROP = 1.0 # probability than any given document is selected -MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included +MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included + +SILENT = True import os, sys, argparse import csv @@ -235,12 +237,13 @@ def paths(self): domains_path = os.path.join(model_dir, 'domain_index') index_path = os.path.join(model_dir, 'paths') - # display paths - print "corpus path:", args.corpus - print "model path:", model_dir - print "writing langs to:", langs_path - print "writing domains to:", domains_path - print "writing index to:", index_path + # display paths + if not SILENT: + print "corpus path:", args.corpus + print "model path:", model_dir + print "writing langs to:", langs_path + print "writing domains to:", domains_path + print "writing index to:", index_path indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, langs = args.lang, domains = args.domain) @@ -248,15 +251,18 @@ def paths(self): # Compute mappings between files, languages and domains lang_dist = indexer.dist_lang lang_index = indexer.lang_index - lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) - print "langs({0}): {1}".format(len(lang_dist), lang_info) + lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) + if not SILENT: + print "langs({0}): {1}".format(len(lang_dist), lang_info) domain_dist = indexer.dist_domain domain_index = indexer.domain_index - domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) - print "domains({0}): {1}".format(len(domain_dist), domain_info) + domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) + if not SILENT: + print "domains({0}): {1}".format(len(domain_dist), domain_info) - print "identified {0} files".format(len(indexer.items)) + if not SILENT: + print "identified {0} files".format(len(indexer.items)) # output the language index with open(langs_path,'w') as f: diff --git a/langid/train/mytrain.py b/langid/train/mytrain.py new file mode 100644 index 00000000..b1e461d8 --- /dev/null +++ b/langid/train/mytrain.py @@ -0,0 +1,47 @@ +import os +import subprocess + +CWD = os.getcwd() + +def main(): + + # Indexing + cmd = "python %s/langid/train/index.py -l en -l es -l ar -d gvo -d internet -d egypt %s/corpus-esaren" % (CWD, CWD) + subprocess.call(cmd, stdin=None, 
stdout=None, stderr=None, shell=True) + + # Tokenization + cmd = "python %s/langid/train/tokenize.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + cmd = "python %s/langid/train/DFfeatureselect.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # Information Gain + cmd = "python %s/langid/train/IGweight.py -d %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + cmd = "python %s/langid/train/IGweight.py -lb %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # LD Featureselect + cmd = "python %s/langid/train/LDfeatureselect.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # Scanner + cmd = "python %s/langid/train/scanner.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + + # NB Train + cmd = "python %s/langid/train/NBtrain.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + + +import cProfile +import pstats +cProfile.run('main()','train_prof') +p_stats = pstats.Stats('train_prof') +p_stats.sort_stats('time').print_stats(10) + +print 'Done' \ No newline at end of file diff --git a/langid/train/scanner.py b/langid/train/scanner.py index 838b54f4..d7054945 100644 --- a/langid/train/scanner.py +++ b/langid/train/scanner.py @@ -39,7 +39,9 @@ import os, sys, argparse import array from collections import deque, defaultdict -from common import read_features +from common import read_features + +SILENT = True class Scanner(object): alphabet = map(chr, range(1<<8)) @@ -189,8 +191,9 @@ def build_scanner(features): """ feat_index = index(features) - # Build the actual scanner - print "building scanner" + # Build the actual scanner + if not SILENT: + print "building scanner" scanner = Scanner(features) tk_nextmove, raw_output = scanner.__getstate__() @@ -227,14 +230,16 @@ def index(seq): else: output_path = input_path + '.scanner' - # display paths - print "input path:", input_path - print "output path:", output_path + # display paths + if not SILENT: + print "input path:", input_path + print "output path:", output_path nb_features = read_features(input_path) tk_nextmove, tk_output = build_scanner(nb_features) scanner = tk_nextmove, tk_output, nb_features with open(output_path, 'w') as f: - cPickle.dump(scanner, f) - print "wrote scanner to {0}".format(output_path) + cPickle.dump(scanner, f) + if not SILENT: + print "wrote scanner to {0}".format(output_path) diff --git a/langid/train/tokenize.py b/langid/train/tokenize.py index 70111e14..b9d530ff 100644 --- a/langid/train/tokenize.py +++ b/langid/train/tokenize.py @@ -41,7 +41,9 @@ MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider TOP_DOC_FREQ = 15000 # number of tokens to consider for each order NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation -CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) +CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) + +SILENT = True import os, sys, argparse import csv @@ -195,17 +197,21 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks doc_count = defaultdict(int) - chunk_count = len(item_chunks) - print "chunk size: {0} ({1} 
chunks)".format(chunk_size, chunk_count) - print "job count: {0}".format(jobs) - - if sample_count: - print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count) - else: - print "whole-document tokenization" - - for i, keycount in enumerate(pass_tokenize_out): - print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount) + chunk_count = len(item_chunks) + if not SILENT: + print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count) + print "job count: {0}".format(jobs) + + if sample_count: + if not SILENT: + print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count) + else: + if not SILENT: + print "whole-document tokenization" + + for i, keycount in enumerate(pass_tokenize_out): + if not SILENT: + print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount) complete = True @@ -241,10 +247,11 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist') index_path = os.path.join(args.model, 'paths') - # display paths - print "index path:", index_path - print "bucketlist path:", bucketlist_path - print "buckets path:", buckets_dir + # display paths + if not SILENT: + print "index path:", index_path + print "bucketlist path:", bucketlist_path + print "buckets path:", buckets_dir with open(index_path) as f: reader = csv.reader(f) @@ -253,24 +260,30 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1: parser.error('can only specify one of --word, --scanner and --max_order') - # Tokenize - print "will tokenize %d files" % len(items) + # Tokenize + if not SILENT: + print "will tokenize %d files" % len(items) if args.scanner: from scanner import Scanner - tokenizer = Scanner.from_file(args.scanner) - print "using provided scanner: ", args.scanner + tokenizer = Scanner.from_file(args.scanner) + if not SILENT: + print "using provided scanner: ", args.scanner elif args.word: - tokenizer = str.split - print "using str.split to tokenize" + tokenizer = str.split + if not SILENT: + print "using str.split to tokenize" else: min_order = args.min_order if args.min_order else MIN_NGRAM_ORDER max_order = args.max_order if args.max_order else MAX_NGRAM_ORDER - tokenizer = NGramTokenizer(min_order,max_order) - print "using n-gram tokenizer: min_order({0}) max_order({1})".format(min_order,max_order) - if args.term_freq: - print "counting term frequency" - else: - print "counting document frequency" + tokenizer = NGramTokenizer(min_order,max_order) + if not SILENT: + print "using n-gram tokenizer: min_order({0}) max_order({1})".format(min_order,max_order) + if args.term_freq: + if not SILENT: + print "counting term frequency" + else: + if not SILENT: + print "counting document frequency" b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size, args.term_freq) # output the paths to the buckets diff --git a/langid/train/train.py b/langid/train/train.py index 85231ea0..30b62945 100644 --- a/langid/train/train.py +++ b/langid/train/train.py @@ -229,6 +229,7 @@ ig_vals = {} for label, dist, suffix, binarize in ig_params: print "Computing information gain for {0}".format(label) + print 'b_dirs:', b_dirs ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs) if args.debug: weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' 
if binarize else '')) From c67aafe1bb7d39f66469ef53b7edc74b9ca7c401 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Sun, 9 Mar 2014 13:26:28 +0200 Subject: [PATCH 4/5] Testing script for the 3 languages, test.py --- langid/__init__.py | 2 +- langid/langid.py | 8 +++++- langid/train/IGweight.py | 4 +-- test.py | 56 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 test.py diff --git a/langid/__init__.py b/langid/__init__.py index d6a60148..4146f620 100644 --- a/langid/__init__.py +++ b/langid/__init__.py @@ -1 +1 @@ -from langid import classify, rank, set_languages +from langid import classify, rank, set_languages, load_model diff --git a/langid/langid.py b/langid/langid.py index fe76893a..ccf01ae6 100644 --- a/langid/langid.py +++ b/langid/langid.py @@ -38,6 +38,8 @@ FORCE_WSGIREF = False NORM_PROBS = True # Normalize optput probabilities. +SILENT = True + # NORM_PROBS can be set to False for a small speed increase. It does not # affect the relative ordering of the predicted classes. @@ -102,7 +104,7 @@ def rank(instance): global identifier if identifier is None: load_model() - + return identifier.rank(instance) def cl_path(path): @@ -156,6 +158,8 @@ class LanguageIdentifier(object): @classmethod def from_modelstring(cls, string, *args, **kwargs): + if not SILENT: + print 'Loading model from string' model = loads(bz2.decompress(base64.b64decode(string))) nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model nb_numfeats = len(nb_ptc) / len(nb_pc) @@ -168,6 +172,8 @@ def from_modelstring(cls, string, *args, **kwargs): @classmethod def from_modelpath(cls, path, *args, **kwargs): + if not SILENT: + print 'Loading model from path' with open(path) as f: return cls.from_modelstring(f.read(), *args, **kwargs) diff --git a/langid/train/IGweight.py b/langid/train/IGweight.py index 5a11658e..69e886b4 100644 --- a/langid/train/IGweight.py +++ b/langid/train/IGweight.py @@ -242,8 +242,8 @@ def read_dist(path): print "features path:", feature_path print "weights path:", weights_path print "index path:", index_path - print "suffix:", suffix - + print "suffix:", suffix + if not SILENT: print "computing information gain" # Compile buckets together diff --git a/test.py b/test.py new file mode 100644 index 00000000..4cfc98eb --- /dev/null +++ b/test.py @@ -0,0 +1,56 @@ +import langid +import os + +test_path = 'corpus-esaren.train' + +langid.load_model(path="corpus-esaren.model/model") + +class Accuracy: + + def __init__(self): + self.correct = 0 + self.incorrect = 0 + + def update(self, correct=True): + if correct: + self.correct += 1 + else: + self.incorrect += 1 + #print 'updates', self.correct, self.incorrect + + def evaluate(self): + total_cases = self.correct + self.incorrect + accuracy = self.correct * 100.0 / total_cases + print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) + +def visit(arg, dirname, names): + path = dirname.split('/') + + if len(path) == 1: + #print names + for i in range(len(names)-1,0,-1): + if names[i].startswith('.'): + del names[i] + #print names + else: + lang = path[1] + #print arg, dirname, names + for name in names: + fd = open(dirname + '/' + name,'r') + for line in fd.readlines(): + res = langid.classify(line) + #print lang, ':', res + if lang == res[0]: + a.update(correct=True) + else: + #print 'incorrect:', lang, res + a.update(correct=False) + fd.close() + + +a = Accuracy() +os.path.walk(test_path, visit, '') +a.evaluate() + +#res = langid.classify("This is a test") +#print res \ No newline 
at end of file From fc3899c086b88c58c8fb745954115630845d4888 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Sun, 9 Mar 2014 14:03:52 +0200 Subject: [PATCH 5/5] Moving testing module to corpuslib --- corpuslib/__init__.py | 1 + corpuslib/test.py | 54 +++++++++++++++++++++++++++++++++++++++++++ corpuslib/train.py | 0 test.py | 49 ++++----------------------------------- 4 files changed, 60 insertions(+), 44 deletions(-) create mode 100644 corpuslib/__init__.py create mode 100644 corpuslib/test.py create mode 100644 corpuslib/train.py diff --git a/corpuslib/__init__.py b/corpuslib/__init__.py new file mode 100644 index 00000000..e419f7af --- /dev/null +++ b/corpuslib/__init__.py @@ -0,0 +1 @@ +from test import Accuracy, Test \ No newline at end of file diff --git a/corpuslib/test.py b/corpuslib/test.py new file mode 100644 index 00000000..cfe8a8da --- /dev/null +++ b/corpuslib/test.py @@ -0,0 +1,54 @@ +import os + +class Test: + + def __init__(self, root='', langid=None, accuracy=None): + self.root = root + self.langid = langid + self.a = accuracy + + def visit(self, arg, dirname, names): + path = dirname.split('/') + + if len(path) == 1: + #print names + for i in range(len(names)-1,0,-1): + if names[i].startswith('.'): + del names[i] + #print names + else: + lang = path[1] + #print arg, dirname, names + for name in names: + fd = open(dirname + '/' + name,'r') + for line in fd.readlines(): + res = self.langid.classify(line) + #print lang, ':', res + if lang == res[0]: + self.a.update(correct=True) + else: + #print 'incorrect:', lang, res + self.a.update(correct=False) + fd.close() + + def start(self): + os.path.walk(self.root, self.visit, '') + +class Accuracy: + + def __init__(self): + self.correct = 0 + self.incorrect = 0 + + def update(self, correct=True): + if correct: + self.correct += 1 + else: + self.incorrect += 1 + #print 'updates', self.correct, self.incorrect + + def evaluate(self): + total_cases = self.correct + self.incorrect + accuracy = self.correct * 100.0 / total_cases + print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) + diff --git a/corpuslib/train.py b/corpuslib/train.py new file mode 100644 index 00000000..e69de29b diff --git a/test.py b/test.py index 4cfc98eb..81366186 100644 --- a/test.py +++ b/test.py @@ -1,55 +1,16 @@ import langid import os +import corpuslib + test_path = 'corpus-esaren.train' langid.load_model(path="corpus-esaren.model/model") -class Accuracy: - - def __init__(self): - self.correct = 0 - self.incorrect = 0 - - def update(self, correct=True): - if correct: - self.correct += 1 - else: - self.incorrect += 1 - #print 'updates', self.correct, self.incorrect - - def evaluate(self): - total_cases = self.correct + self.incorrect - accuracy = self.correct * 100.0 / total_cases - print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) - -def visit(arg, dirname, names): - path = dirname.split('/') - - if len(path) == 1: - #print names - for i in range(len(names)-1,0,-1): - if names[i].startswith('.'): - del names[i] - #print names - else: - lang = path[1] - #print arg, dirname, names - for name in names: - fd = open(dirname + '/' + name,'r') - for line in fd.readlines(): - res = langid.classify(line) - #print lang, ':', res - if lang == res[0]: - a.update(correct=True) - else: - #print 'incorrect:', lang, res - a.update(correct=False) - fd.close() - -a = Accuracy() -os.path.walk(test_path, visit, '') +a = corpuslib.Accuracy() +t = corpuslib.Test(test_path, langid=langid, accuracy=a) +t.start() a.evaluate() #res = 
langid.classify("This is a test")
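
A note on the change in PATCH 1/5: the do_gzip branch added to unmarshal_iter in common.py duplicates the whole read/unmarshal loop for the gzip and plain-file cases. A minimal, equivalent sketch without the duplication (assuming Python 2.7 to match the rest of the series; this is a suggested follow-up, not part of the patches above):

    import gzip
    import marshal
    import tempfile

    def unmarshal_iter(path, do_gzip=True):
      """
      Open a given path and yield an iterator over items unmarshalled from it.
      """
      # Select the opener once so the unmarshalling loop is not duplicated.
      opener = gzip.open if do_gzip else open
      with opener(path, 'rb') as f, tempfile.TemporaryFile() as t:
        t.write(f.read())
        t.seek(0)
        while True:
          try:
            yield marshal.load(t)
          except EOFError:
            break

A note on PATCH 2/5: the added guard `if self.ignore_hidden_files and not docname.startswith('.')` appends candidates only while ignore_hidden_files is true, so constructing CorpusIndexer with ignore_hidden_files=False would index no files at all. If the intent is "skip dot-files only when the flag is set", the condition needs to be inverted. A standalone sketch of that behaviour (the helper name iter_candidates is illustrative only, not part of the patch):

    import os

    def iter_candidates(root, ignore_hidden_files=True):
      """
      Yield (dirpath, docname) pairs under root, optionally skipping dot-files.
      """
      for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
        for docname in filenames:
          # Skip hidden files (names starting with a dot) only when asked to.
          if ignore_hidden_files and docname.startswith('.'):
            continue
          yield (dirpath, docname)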