From a7e1e9c53ec028bc6ace80ee6e4a90bfce1a3282 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Wed, 5 Mar 2014 02:57:23 +0200 Subject: [PATCH 1/5] Fixing gzip problem in NBtrain.py, by doing some changes in it and in common.py as well --- langid/train/NBtrain.py | 2 +- langid/train/common.py | 28 +++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/langid/train/NBtrain.py b/langid/train/NBtrain.py index 75b6f6e6..f24f688f 100644 --- a/langid/train/NBtrain.py +++ b/langid/train/NBtrain.py @@ -131,7 +131,7 @@ def pass_ptc(b_dir): read_count = 0 for path in os.listdir(b_dir): if path.endswith('.index'): - for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)): + for f_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path),do_gzip=False): terms[f_id][doc_id] = count read_count += 1 diff --git a/langid/train/common.py b/langid/train/common.py index 8a31b656..c1880f74 100644 --- a/langid/train/common.py +++ b/langid/train/common.py @@ -32,18 +32,28 @@ def chunk(seq, chunksize): if not chunk: break yield chunk -def unmarshal_iter(path): +def unmarshal_iter(path, do_gzip=True): """ Open a given path and yield an iterator over items unmarshalled from it. """ - with gzip.open(path, 'rb') as f, tempfile.TemporaryFile() as t: - t.write(f.read()) - t.seek(0) - while True: - try: - yield marshal.load(t) - except EOFError: - break + if do_gzip: + with gzip.open(path, 'rb') as f, tempfile.TemporaryFile() as t: + t.write(f.read()) + t.seek(0) + while True: + try: + yield marshal.load(t) + except EOFError: + break + else: + with open(path, 'rb') as f, tempfile.TemporaryFile() as t: + t.write(f.read()) + t.seek(0) + while True: + try: + yield marshal.load(t) + except EOFError: + break import os, errno def makedir(path): From e7924010b05141f3667e45d67d5137389c73c8ad Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Wed, 5 Mar 2014 17:40:17 +0200 Subject: [PATCH 2/5] Adding the ability to skip hidden files starting with dot in their names --- langid/train/index.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/langid/train/index.py b/langid/train/index.py index 138e84ab..0b7cd7fb 100644 --- a/langid/train/index.py +++ b/langid/train/index.py @@ -74,10 +74,11 @@ class CorpusIndexer(object): """ Class to index the contents of a corpus """ - def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None): + def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None, ignore_hidden_files=True): self.root = root self.min_domain = min_domain - self.proportion = proportion + self.proportion = proportion + self.ignore_hidden_files = ignore_hidden_files if langs is None: self.lang_index = defaultdict(Enumerator()) @@ -102,8 +103,10 @@ def index(self, root): # root supplied was the root of a directory structure candidates = [] for dirpath, dirnames, filenames in os.walk(root, followlinks=True): - for docname in filenames: - candidates.append((dirpath, docname)) + for docname in filenames: + # Ignore hidden files starting with dot if told to do so + if self.ignore_hidden_files and not docname.startswith('.'): + candidates.append((dirpath, docname)) else: # root supplied was a file, interpet as list of paths candidates = [os.path.split(str.strip(l)) for l in open(root)] From dd191b4d7edd0c505ec6f1195911a4198bb059a1 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Thu, 6 Mar 2014 14:23:18 +0200 Subject: [PATCH 3/5] Adding mytrain.py to do all training at once, also edited 
other training components to make them silent, no print outs --- langid/train/DFfeatureselect.py | 48 ++++++++++++++--------- langid/train/IGweight.py | 42 ++++++++++++-------- langid/train/LDfeatureselect.py | 25 +++++++----- langid/train/NBtrain.py | 30 ++++++++------ langid/train/index.py | 30 ++++++++------ langid/train/mytrain.py | 47 ++++++++++++++++++++++ langid/train/scanner.py | 21 ++++++---- langid/train/tokenize.py | 69 ++++++++++++++++++++------------- langid/train/train.py | 1 + 9 files changed, 208 insertions(+), 105 deletions(-) create mode 100644 langid/train/mytrain.py diff --git a/langid/train/DFfeatureselect.py b/langid/train/DFfeatureselect.py index ea69c4d0..a5007568 100644 --- a/langid/train/DFfeatureselect.py +++ b/langid/train/DFfeatureselect.py @@ -38,7 +38,9 @@ # Can be overriden with command-line options ###### MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider -TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order +TOKENS_PER_ORDER = 15000 # number of tokens to consider for each order + +SILENT = True import os, sys, argparse import collections @@ -88,8 +90,9 @@ def tally(bucketlist, jobs=None): with MapPool(jobs) as f: pass_sum_df_out = f(pass_sum_df, bucketlist) - for i, keycount in enumerate(pass_sum_df_out): - print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount) + for i, keycount in enumerate(pass_sum_df_out): + if not SILENT: + print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount) # build the global term->df mapping doc_count = {} @@ -146,35 +149,42 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P else: bucketlist_path = os.path.join(args.model, 'bucketlist') - # display paths - print "buckets path:", bucketlist_path - print "features output path:", feature_path - if args.tokens_per_order: - print "max ngram order:", args.max_order - print "tokens per order:", args.tokens_per_order - else: - print "tokens:", args.tokens + # display paths + if not SILENT: + print "buckets path:", bucketlist_path + print "features output path:", feature_path + if args.tokens_per_order: + if not SILENT: + print "max ngram order:", args.max_order + print "tokens per order:", args.tokens_per_order + else: + if not SILENT: + print "tokens:", args.tokens with open(bucketlist_path) as f: bucketlist = map(str.strip, f) - doc_count = tally(bucketlist, args.jobs) - print "unique features:", len(doc_count) + doc_count = tally(bucketlist, args.jobs) + if not SILENT: + print "unique features:", len(doc_count) if args.doc_count: # The constant true is used to indicate output to default location doc_count_path = os.path.join(args.model, 'DF_all') if args.doc_count == True else args.doc_count - write_weights(doc_count, doc_count_path) - print "wrote DF counts for all features to:", doc_count_path + write_weights(doc_count, doc_count_path) + if not SILENT: + print "wrote DF counts for all features to:", doc_count_path if args.tokens_per_order: # Choose a number of features for each length of token feats = ngram_select(doc_count, args.max_order, args.tokens_per_order) else: # Choose a number of features overall - feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) - print "selected features: ", len(feats) + feats = sorted( sorted(doc_count, key=doc_count.get, reverse=True)[:args.tokens] ) + if not SILENT: + print "selected features: ", len(feats) - write_features(feats, feature_path) - print 'wrote features to "%s"' % feature_path + write_features(feats, 
feature_path) + if not SILENT: + print 'wrote features to "%s"' % feature_path diff --git a/langid/train/IGweight.py b/langid/train/IGweight.py index 4f59936d..5a11658e 100644 --- a/langid/train/IGweight.py +++ b/langid/train/IGweight.py @@ -32,7 +32,9 @@ The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the copyright holder. -""" +""" + +SILENT = True import os, sys, argparse import csv @@ -89,15 +91,18 @@ def pass_IG(buckets): @param buckets a list of buckets. Each bucket must be a directory that contains files with the appropriate suffix. Each file must contain marshalled (term, event_id, count) triplets. - """ + """ + global __features, __dist, __binarize, __suffix # We first tally the per-event frequency of each # term in our selected feature set. term_freq = defaultdict(lambda: defaultdict(int)) term_index = defaultdict(Enumerator()) - - for bucket in buckets: + + for bucket in buckets: + if not SILENT: + print 'bucket:', bucket for path in os.listdir(bucket): if path.endswith(__suffix): for key, event_id, count in unmarshal_iter(os.path.join(bucket,path)): @@ -165,8 +170,9 @@ def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None): for i, (t, w) in enumerate(pass_IG_out): weights.append(w) - terms.extend(t) - print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t)) + terms.extend(t) + if not SILENT: + print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t)) if binarize: weights = numpy.hstack(weights).transpose() @@ -229,22 +235,26 @@ def read_dist(path): else: weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) - # display paths - print "model path:", args.model - print "buckets path:", bucketlist_paths - print "features path:", feature_path - print "weights path:", weights_path - print "index path:", index_path - print "suffix:", suffix - - print "computing information gain" + # display paths + if not SILENT: + print "model path:", args.model + print "buckets path:", bucketlist_paths + print "features path:", feature_path + print "weights path:", weights_path + print "index path:", index_path + print "suffix:", suffix + + if not SILENT: + print "computing information gain" # Compile buckets together bucketlist = zip(*(map(str.strip, open(p)) for p in bucketlist_paths)) # Check that each bucketlist has the same number of buckets assert len(set(map(len,bucketlist))) == 1, "incompatible bucketlists!" 
- dist = read_dist(index_path) + dist = read_dist(index_path) + if not SILENT: + print 'bucketlist:', bucketlist ig = compute_IG(bucketlist, features, dist, args.binarize, suffix, args.jobs) write_weights(ig, weights_path) diff --git a/langid/train/LDfeatureselect.py b/langid/train/LDfeatureselect.py index d8c11ee4..4c5837e0 100644 --- a/langid/train/LDfeatureselect.py +++ b/langid/train/LDfeatureselect.py @@ -37,7 +37,9 @@ # Default values # Can be overriden with command-line options ###### -FEATURES_PER_LANG = 300 # number of features to select for each language +FEATURES_PER_LANG = 300 # number of features to select for each language + +SILENT = True import os, sys, argparse import csv @@ -90,11 +92,12 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): domain_w_path = os.path.join(args.model, 'IGweights.domain') feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') - # display paths - print "model path:", args.model - print "lang weights path:", lang_w_path - print "domain weights path:", domain_w_path - print "feature output path:", feature_path + # display paths + if not SILENT: + print "model path:", args.model + print "lang weights path:", lang_w_path + print "domain weights path:", domain_w_path + print "feature output path:", feature_path lang_w = read_weights(lang_w_path) domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None @@ -107,9 +110,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): writer.writerow(map(repr,features_per_lang[i])) - final_feature_set = reduce(set.union, map(set, features_per_lang.values())) - print 'selected %d features' % len(final_feature_set) + final_feature_set = reduce(set.union, map(set, features_per_lang.values())) + if not SILENT: + print 'selected %d features' % len(final_feature_set) - write_features(sorted(final_feature_set), feature_path) - print 'wrote features to "%s"' % feature_path + write_features(sorted(final_feature_set), feature_path) + if not SILENT: + print 'wrote features to "%s"' % feature_path diff --git a/langid/train/NBtrain.py b/langid/train/NBtrain.py index f24f688f..541cf421 100644 --- a/langid/train/NBtrain.py +++ b/langid/train/NBtrain.py @@ -35,6 +35,8 @@ """ MAX_CHUNK_SIZE = 100 # maximum number of files to tokenize at once NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation + +SILENT = True import base64, bz2, cPickle import os, sys, argparse, csv @@ -189,16 +191,18 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args): with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f: pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg) - write_count = sum(pass_tokenize_out) - print "wrote a total of %d keys" % write_count + write_count = sum(pass_tokenize_out) + if not SILENT: + print "wrote a total of %d keys" % write_count pass_ptc_params = (cm, num_instances) with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f: pass_ptc_out = f(pass_ptc, b_dirs) reads, ids, prods = zip(*pass_ptc_out) - read_count = sum(reads) - print "read a total of %d keys (%d short)" % (read_count, write_count - read_count) + read_count = sum(reads) + if not SILENT: + print "read a total of %d keys (%d short)" % (read_count, write_count - read_count) prod = np.zeros((num_features, cm.shape[1]), dtype=int) prod[np.concatenate(ids)] = np.vstack(prods) @@ -249,12 +253,13 @@ def cleanup(): index_path = os.path.join(args.model, 'paths') lang_path = os.path.join(args.model, 'lang_index') - 
# display paths - print "model path:", args.model - print "temp path:", temp_path - print "scanner path:", scanner_path - #print "index path:", index_path - print "output path:", output_path + # display paths + if not SILENT: + print "model path:", args.model + print "temp path:", temp_path + print "scanner path:", scanner_path + print "index path:", index_path + print "output path:", output_path # read list of training files with open(index_path) as f: @@ -281,5 +286,6 @@ def cleanup(): model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output string = base64.b64encode(bz2.compress(cPickle.dumps(model))) with open(output_path, 'w') as f: - f.write(string) - print "wrote model to %s (%d bytes)" % (output_path, len(string)) + f.write(string) + if not SILENT: + print "wrote model to %s (%d bytes)" % (output_path, len(string)) diff --git a/langid/train/index.py b/langid/train/index.py index 0b7cd7fb..563f6a56 100644 --- a/langid/train/index.py +++ b/langid/train/index.py @@ -59,7 +59,9 @@ # Can be overriden with command-line options ###### TRAIN_PROP = 1.0 # probability than any given document is selected -MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included +MIN_DOMAIN = 1 # minimum number of domains a language must be present in to be included + +SILENT = True import os, sys, argparse import csv @@ -235,12 +237,13 @@ def paths(self): domains_path = os.path.join(model_dir, 'domain_index') index_path = os.path.join(model_dir, 'paths') - # display paths - print "corpus path:", args.corpus - print "model path:", model_dir - print "writing langs to:", langs_path - print "writing domains to:", domains_path - print "writing index to:", index_path + # display paths + if not SILENT: + print "corpus path:", args.corpus + print "model path:", model_dir + print "writing langs to:", langs_path + print "writing domains to:", domains_path + print "writing index to:", index_path indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, langs = args.lang, domains = args.domain) @@ -248,15 +251,18 @@ def paths(self): # Compute mappings between files, languages and domains lang_dist = indexer.dist_lang lang_index = indexer.lang_index - lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) - print "langs({0}): {1}".format(len(lang_dist), lang_info) + lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) + if not SILENT: + print "langs({0}): {1}".format(len(lang_dist), lang_info) domain_dist = indexer.dist_domain domain_index = indexer.domain_index - domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) - print "domains({0}): {1}".format(len(domain_dist), domain_info) + domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) + if not SILENT: + print "domains({0}): {1}".format(len(domain_dist), domain_info) - print "identified {0} files".format(len(indexer.items)) + if not SILENT: + print "identified {0} files".format(len(indexer.items)) # output the language index with open(langs_path,'w') as f: diff --git a/langid/train/mytrain.py b/langid/train/mytrain.py new file mode 100644 index 00000000..b1e461d8 --- /dev/null +++ b/langid/train/mytrain.py @@ -0,0 +1,47 @@ +import os +import subprocess + +CWD = os.getcwd() + +def main(): + + # Indexing + cmd = "python %s/langid/train/index.py -l en -l es -l ar -d gvo -d internet -d egypt %s/corpus-esaren" % (CWD, CWD) + subprocess.call(cmd, stdin=None, 
stdout=None, stderr=None, shell=True) + + # Tokenization + cmd = "python %s/langid/train/tokenize.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + cmd = "python %s/langid/train/DFfeatureselect.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # Information Gain + cmd = "python %s/langid/train/IGweight.py -d %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + cmd = "python %s/langid/train/IGweight.py -lb %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # LD Featureselect + cmd = "python %s/langid/train/LDfeatureselect.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + # Scanner + cmd = "python %s/langid/train/scanner.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + + # NB Train + cmd = "python %s/langid/train/NBtrain.py %s/corpus-esaren.model" % (CWD, CWD) + subprocess.call(cmd, stdin=None, stdout=None, stderr=None, shell=True) + + + +import cProfile +import pstats +cProfile.run('main()','train_prof') +p_stats = pstats.Stats('train_prof') +p_stats.sort_stats('time').print_stats(10) + +print 'Done' \ No newline at end of file diff --git a/langid/train/scanner.py b/langid/train/scanner.py index 838b54f4..d7054945 100644 --- a/langid/train/scanner.py +++ b/langid/train/scanner.py @@ -39,7 +39,9 @@ import os, sys, argparse import array from collections import deque, defaultdict -from common import read_features +from common import read_features + +SILENT = True class Scanner(object): alphabet = map(chr, range(1<<8)) @@ -189,8 +191,9 @@ def build_scanner(features): """ feat_index = index(features) - # Build the actual scanner - print "building scanner" + # Build the actual scanner + if not SILENT: + print "building scanner" scanner = Scanner(features) tk_nextmove, raw_output = scanner.__getstate__() @@ -227,14 +230,16 @@ def index(seq): else: output_path = input_path + '.scanner' - # display paths - print "input path:", input_path - print "output path:", output_path + # display paths + if not SILENT: + print "input path:", input_path + print "output path:", output_path nb_features = read_features(input_path) tk_nextmove, tk_output = build_scanner(nb_features) scanner = tk_nextmove, tk_output, nb_features with open(output_path, 'w') as f: - cPickle.dump(scanner, f) - print "wrote scanner to {0}".format(output_path) + cPickle.dump(scanner, f) + if not SILENT: + print "wrote scanner to {0}".format(output_path) diff --git a/langid/train/tokenize.py b/langid/train/tokenize.py index 70111e14..b9d530ff 100644 --- a/langid/train/tokenize.py +++ b/langid/train/tokenize.py @@ -41,7 +41,9 @@ MAX_NGRAM_ORDER = 4 # largest order of n-grams to consider TOP_DOC_FREQ = 15000 # number of tokens to consider for each order NUM_BUCKETS = 64 # number of buckets to use in k-v pair generation -CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) +CHUNKSIZE = 50 # maximum size of chunk (number of files tokenized - less = less memory use) + +SILENT = True import os, sys, argparse import csv @@ -195,17 +197,21 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks doc_count = defaultdict(int) - chunk_count = len(item_chunks) - print "chunk size: {0} ({1} 
chunks)".format(chunk_size, chunk_count) - print "job count: {0}".format(jobs) - - if sample_count: - print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count) - else: - print "whole-document tokenization" - - for i, keycount in enumerate(pass_tokenize_out): - print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount) + chunk_count = len(item_chunks) + if not SILENT: + print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count) + print "job count: {0}".format(jobs) + + if sample_count: + if not SILENT: + print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count) + else: + if not SILENT: + print "whole-document tokenization" + + for i, keycount in enumerate(pass_tokenize_out): + if not SILENT: + print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount) complete = True @@ -241,10 +247,11 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist') index_path = os.path.join(args.model, 'paths') - # display paths - print "index path:", index_path - print "bucketlist path:", bucketlist_path - print "buckets path:", buckets_dir + # display paths + if not SILENT: + print "index path:", index_path + print "bucketlist path:", bucketlist_path + print "buckets path:", buckets_dir with open(index_path) as f: reader = csv.reader(f) @@ -253,24 +260,30 @@ def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunks if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1: parser.error('can only specify one of --word, --scanner and --max_order') - # Tokenize - print "will tokenize %d files" % len(items) + # Tokenize + if not SILENT: + print "will tokenize %d files" % len(items) if args.scanner: from scanner import Scanner - tokenizer = Scanner.from_file(args.scanner) - print "using provided scanner: ", args.scanner + tokenizer = Scanner.from_file(args.scanner) + if not SILENT: + print "using provided scanner: ", args.scanner elif args.word: - tokenizer = str.split - print "using str.split to tokenize" + tokenizer = str.split + if not SILENT: + print "using str.split to tokenize" else: min_order = args.min_order if args.min_order else MIN_NGRAM_ORDER max_order = args.max_order if args.max_order else MAX_NGRAM_ORDER - tokenizer = NGramTokenizer(min_order,max_order) - print "using n-gram tokenizer: min_order({0}) max_order({1})".format(min_order,max_order) - if args.term_freq: - print "counting term frequency" - else: - print "counting document frequency" + tokenizer = NGramTokenizer(min_order,max_order) + if not SILENT: + print "using n-gram tokenizer: min_order({0}) max_order({1})".format(min_order,max_order) + if args.term_freq: + if not SILENT: + print "counting term frequency" + else: + if not SILENT: + print "counting document frequency" b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size, args.term_freq) # output the paths to the buckets diff --git a/langid/train/train.py b/langid/train/train.py index 85231ea0..30b62945 100644 --- a/langid/train/train.py +++ b/langid/train/train.py @@ -229,6 +229,7 @@ ig_vals = {} for label, dist, suffix, binarize in ig_params: print "Computing information gain for {0}".format(label) + print 'b_dirs:', b_dirs ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs) if args.debug: weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' 
if binarize else '')) From c67aafe1bb7d39f66469ef53b7edc74b9ca7c401 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Sun, 9 Mar 2014 13:26:28 +0200 Subject: [PATCH 4/5] Testing script for the 3 languages, test.py --- langid/__init__.py | 2 +- langid/langid.py | 8 +++++- langid/train/IGweight.py | 4 +-- test.py | 56 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 test.py diff --git a/langid/__init__.py b/langid/__init__.py index d6a60148..4146f620 100644 --- a/langid/__init__.py +++ b/langid/__init__.py @@ -1 +1 @@ -from langid import classify, rank, set_languages +from langid import classify, rank, set_languages, load_model diff --git a/langid/langid.py b/langid/langid.py index fe76893a..ccf01ae6 100644 --- a/langid/langid.py +++ b/langid/langid.py @@ -38,6 +38,8 @@ FORCE_WSGIREF = False NORM_PROBS = True # Normalize optput probabilities. +SILENT = True + # NORM_PROBS can be set to False for a small speed increase. It does not # affect the relative ordering of the predicted classes. @@ -102,7 +104,7 @@ def rank(instance): global identifier if identifier is None: load_model() - + return identifier.rank(instance) def cl_path(path): @@ -156,6 +158,8 @@ class LanguageIdentifier(object): @classmethod def from_modelstring(cls, string, *args, **kwargs): + if not SILENT: + print 'Loading model from string' model = loads(bz2.decompress(base64.b64decode(string))) nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model nb_numfeats = len(nb_ptc) / len(nb_pc) @@ -168,6 +172,8 @@ def from_modelstring(cls, string, *args, **kwargs): @classmethod def from_modelpath(cls, path, *args, **kwargs): + if not SILENT: + print 'Loading model from path' with open(path) as f: return cls.from_modelstring(f.read(), *args, **kwargs) diff --git a/langid/train/IGweight.py b/langid/train/IGweight.py index 5a11658e..69e886b4 100644 --- a/langid/train/IGweight.py +++ b/langid/train/IGweight.py @@ -242,8 +242,8 @@ def read_dist(path): print "features path:", feature_path print "weights path:", weights_path print "index path:", index_path - print "suffix:", suffix - + print "suffix:", suffix + if not SILENT: print "computing information gain" # Compile buckets together diff --git a/test.py b/test.py new file mode 100644 index 00000000..4cfc98eb --- /dev/null +++ b/test.py @@ -0,0 +1,56 @@ +import langid +import os + +test_path = 'corpus-esaren.train' + +langid.load_model(path="corpus-esaren.model/model") + +class Accuracy: + + def __init__(self): + self.correct = 0 + self.incorrect = 0 + + def update(self, correct=True): + if correct: + self.correct += 1 + else: + self.incorrect += 1 + #print 'updates', self.correct, self.incorrect + + def evaluate(self): + total_cases = self.correct + self.incorrect + accuracy = self.correct * 100.0 / total_cases + print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) + +def visit(arg, dirname, names): + path = dirname.split('/') + + if len(path) == 1: + #print names + for i in range(len(names)-1,0,-1): + if names[i].startswith('.'): + del names[i] + #print names + else: + lang = path[1] + #print arg, dirname, names + for name in names: + fd = open(dirname + '/' + name,'r') + for line in fd.readlines(): + res = langid.classify(line) + #print lang, ':', res + if lang == res[0]: + a.update(correct=True) + else: + #print 'incorrect:', lang, res + a.update(correct=False) + fd.close() + + +a = Accuracy() +os.path.walk(test_path, visit, '') +a.evaluate() + +#res = langid.classify("This is a test") +#print res \ No newline 
at end of file From fc3899c086b88c58c8fb745954115630845d4888 Mon Sep 17 00:00:00 2001 From: Tarek Amr Date: Sun, 9 Mar 2014 14:03:52 +0200 Subject: [PATCH 5/5] Moving testing module to corpuslib --- corpuslib/__init__.py | 1 + corpuslib/test.py | 54 +++++++++++++++++++++++++++++++++++++++++++ corpuslib/train.py | 0 test.py | 49 ++++----------------------------------- 4 files changed, 60 insertions(+), 44 deletions(-) create mode 100644 corpuslib/__init__.py create mode 100644 corpuslib/test.py create mode 100644 corpuslib/train.py diff --git a/corpuslib/__init__.py b/corpuslib/__init__.py new file mode 100644 index 00000000..e419f7af --- /dev/null +++ b/corpuslib/__init__.py @@ -0,0 +1 @@ +from test import Accuracy, Test \ No newline at end of file diff --git a/corpuslib/test.py b/corpuslib/test.py new file mode 100644 index 00000000..cfe8a8da --- /dev/null +++ b/corpuslib/test.py @@ -0,0 +1,54 @@ +import os + +class Test: + + def __init__(self, root='', langid=None, accuracy=None): + self.root = root + self.langid = langid + self.a = accuracy + + def visit(self, arg, dirname, names): + path = dirname.split('/') + + if len(path) == 1: + #print names + for i in range(len(names)-1,0,-1): + if names[i].startswith('.'): + del names[i] + #print names + else: + lang = path[1] + #print arg, dirname, names + for name in names: + fd = open(dirname + '/' + name,'r') + for line in fd.readlines(): + res = self.langid.classify(line) + #print lang, ':', res + if lang == res[0]: + self.a.update(correct=True) + else: + #print 'incorrect:', lang, res + self.a.update(correct=False) + fd.close() + + def start(self): + os.path.walk(self.root, self.visit, '') + +class Accuracy: + + def __init__(self): + self.correct = 0 + self.incorrect = 0 + + def update(self, correct=True): + if correct: + self.correct += 1 + else: + self.incorrect += 1 + #print 'updates', self.correct, self.incorrect + + def evaluate(self): + total_cases = self.correct + self.incorrect + accuracy = self.correct * 100.0 / total_cases + print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) + diff --git a/corpuslib/train.py b/corpuslib/train.py new file mode 100644 index 00000000..e69de29b diff --git a/test.py b/test.py index 4cfc98eb..81366186 100644 --- a/test.py +++ b/test.py @@ -1,55 +1,16 @@ import langid import os +import corpuslib + test_path = 'corpus-esaren.train' langid.load_model(path="corpus-esaren.model/model") -class Accuracy: - - def __init__(self): - self.correct = 0 - self.incorrect = 0 - - def update(self, correct=True): - if correct: - self.correct += 1 - else: - self.incorrect += 1 - #print 'updates', self.correct, self.incorrect - - def evaluate(self): - total_cases = self.correct + self.incorrect - accuracy = self.correct * 100.0 / total_cases - print 'Accuracy = %f %% (of %d test cases)' % (accuracy, total_cases) - -def visit(arg, dirname, names): - path = dirname.split('/') - - if len(path) == 1: - #print names - for i in range(len(names)-1,0,-1): - if names[i].startswith('.'): - del names[i] - #print names - else: - lang = path[1] - #print arg, dirname, names - for name in names: - fd = open(dirname + '/' + name,'r') - for line in fd.readlines(): - res = langid.classify(line) - #print lang, ':', res - if lang == res[0]: - a.update(correct=True) - else: - #print 'incorrect:', lang, res - a.update(correct=False) - fd.close() - -a = Accuracy() -os.path.walk(test_path, visit, '') +a = corpuslib.Accuracy() +t = corpuslib.Test(test_path, langid=langid, accuracy=a) +t.start() a.evaluate() #res = 
langid.classify("This is a test")
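
A note on the change in PATCH 1/5: the do_gzip branch added to unmarshal_iter in common.py duplicates the whole read/unmarshal loop for the gzip and plain-file cases. A minimal, equivalent sketch without the duplication (assuming Python 2.7 to match the rest of the series; this is a suggested follow-up, not part of the patches above):

    import gzip
    import marshal
    import tempfile

    def unmarshal_iter(path, do_gzip=True):
      """
      Open a given path and yield an iterator over items unmarshalled from it.
      """
      # Select the opener once so the unmarshalling loop is not duplicated.
      opener = gzip.open if do_gzip else open
      with opener(path, 'rb') as f, tempfile.TemporaryFile() as t:
        t.write(f.read())
        t.seek(0)
        while True:
          try:
            yield marshal.load(t)
          except EOFError:
            break

A note on PATCH 2/5: the added guard `if self.ignore_hidden_files and not docname.startswith('.')` appends candidates only while ignore_hidden_files is true, so constructing CorpusIndexer with ignore_hidden_files=False would index no files at all. If the intent is "skip dot-files only when the flag is set", the condition needs to be inverted. A standalone sketch of that behaviour (the helper name iter_candidates is illustrative only, not part of the patch):

    import os

    def iter_candidates(root, ignore_hidden_files=True):
      """
      Yield (dirpath, docname) pairs under root, optionally skipping dot-files.
      """
      for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
        for docname in filenames:
          # Skip hidden files (names starting with a dot) only when asked to.
          if ignore_hidden_files and docname.startswith('.'):
            continue
          yield (dirpath, docname)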