#!/usr/local/bin/python
"""Score sentences with an n-gram language model.

The model file holds one n-gram per line in the form
``w1 w2 ... wn<TAB>probability``.  Each input line is treated as one
whitespace-tokenised sentence; it is padded with boundary symbols, split
into n-gram windows, and the log10 probabilities of the windows are summed.
One result line is written per input line.
"""
import sys
import math
import optparse
import fileinput


def load_model(lines):
    """Parse model lines into ``(ngrams, n)``.

    ``lines`` is any iterable of strings shaped ``"w1 ... wn\\tprob"``.
    Returns a dict mapping word tuples to float probabilities, and the
    n-gram order ``n`` taken from the first entry (``None`` when there are
    no entries).  Blank lines are skipped; a non-blank line without exactly
    one tab, or with a non-numeric probability, raises ``ValueError``.
    """
    ngrams = {}
    n = None
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        ngram, p = line.split("\t")
        ngram = tuple(ngram.split())
        if n is None:
            # The first entry fixes the order used for windowing sentences.
            n = len(ngram)
        ngrams[ngram] = float(p)
    return ngrams, n


def sentence_logprob(words, ngrams, n, start_symbol="<s>", end_symbol="</s>"):
    """Return the log10 probability of a sentence, or ``None`` if it is zero.

    ``words`` is the token list of the sentence.  It is padded with ``n - 1``
    copies of ``start_symbol`` in front and one ``end_symbol`` behind before
    being cut into n-gram windows.  An n-gram that is missing from ``ngrams``
    (or stored with probability 0) makes the whole sentence probability zero,
    which is reported as ``None`` instead of calling ``log10(0)``.
    """
    padded = [start_symbol] * (n - 1) + list(words) + [end_symbol]
    total = 0.0
    for i in range(len(padded) - n + 1):
        prob = ngrams.get(tuple(padded[i:i + n]), 0.0)
        if prob == 0.0:
            # math.log10(0.0) raises ValueError on current Python versions
            # (the original only caught OverflowError), so test explicitly.
            return None
        total += math.log10(prob)
    return total


def format_result(line, logprob):
    """Build the output line for input ``line`` and its score.

    ``logprob`` is the value returned by ``sentence_logprob``; ``None``
    produces the "zero" form of the output.
    """
    if logprob is None:
        return line.strip() + "\t => zero\n"
    return line.strip() + "\t => exp(%f)\n" % logprob


def main(argv=None):
    """Command-line entry point; ``argv`` defaults to ``sys.argv[1:]``."""
    optparser = optparse.OptionParser(usage="%prog [options] INPUT_FILENAME*")
    optparser.add_option('-m', '--model', dest='model_filename',
                         type='string', default=None,
                         help='language model file')
    optparser.add_option('-o', '--output', dest='output_filename',
                         type='string', default="-", help='output file')
    # The original padded sentences with empty strings, which can never occur
    # in whitespace-split model n-grams, so every sentence scored as zero.
    # The defaults below are the conventional boundary markers; this assumes
    # the model file uses them -- override with the options if it does not.
    optparser.add_option('--start-symbol', dest='start_symbol',
                         type='string', default="<s>",
                         help='sentence-start padding symbol')
    optparser.add_option('--end-symbol', dest='end_symbol',
                         type='string', default="</s>",
                         help='sentence-end padding symbol')
    (opts, args) = optparser.parse_args(argv)

    if opts.model_filename is None:
        sys.stderr.write(
            "Error: must provide exactly one language model file\n")
        sys.exit(1)

    with open(opts.model_filename) as model_file:
        ngrams, n = load_model(model_file)
    if n is None:
        # Without any n-gram the order is unknown and nothing can be scored.
        sys.stderr.write("Error: language model file contains no n-grams\n")
        sys.exit(1)

    if opts.output_filename == "-":
        output_file = sys.stdout
    else:
        output_file = open(opts.output_filename, "w")
    try:
        # With no INPUT_FILENAME arguments, fileinput reads standard input.
        for line in fileinput.input(args):
            logprob = sentence_logprob(line.split(), ngrams, n,
                                       opts.start_symbol, opts.end_symbol)
            output_file.write(format_result(line, logprob))
    finally:
        if output_file is not sys.stdout:
            output_file.close()


if __name__ == "__main__":
    main()