#!/usr/local/bin/python
"""Estimate conditional n-gram probabilities from whitespace-tokenized text.

Each input line is treated as one sequence. It is padded on the left with
n-1 empty-string tokens and terminated with a single empty-string token, so
that the first real word has a full-length context and the end of the line
is itself a predicted event.

Output has one line per (context, word) pair:

    <context words joined by spaces> <word>\t<probability>

where probability is count(context, word) / count(context).
"""
import fileinput
import optparse
import sys


def count_ngrams(lines, n):
    """Count how often each word follows each (n-1)-word context.

    lines: iterable of strings; each is split on whitespace.
    n: n-gram order, must be >= 1 (n == 1 gives the empty context ``()``).

    Returns a dict mapping context tuples (length n-1) to dicts that map
    the following word to its occurrence count. The empty string stands
    for both the left padding and the end-of-line marker.

    Raises ValueError if n < 1.
    """
    if n < 1:
        # With n <= 0 the slices below wrap around via negative indices and
        # silently produce meaningless counts, so reject it up front.
        raise ValueError("n must be at least 1, got %d" % n)

    ngrams = {}
    padding = [''] * (n - 1)
    for line in lines:
        words = padding + line.split() + ['']
        # One window per real token plus one for the end-of-line marker.
        for i in range(len(words) - n + 1):
            context = tuple(words[i:i + n - 1])
            word = words[i + n - 1]
            dist = ngrams.setdefault(context, {})
            dist[word] = dist.get(word, 0) + 1
    return ngrams


def write_probabilities(ngrams, output_file):
    """Write relative frequencies of each word given its context.

    ngrams: mapping as returned by count_ngrams().
    output_file: any object with a ``write(str)`` method. It is not closed
    here; ownership stays with the caller.
    """
    for context, dist in ngrams.items():
        denom = sum(dist.values())
        prefix = " ".join(context)
        for word, count in dist.items():
            output_file.write(
                "%s %s\t%f\n" % (prefix, word, float(count) / denom))


def main(argv=None):
    """Parse command-line options, count n-grams and write the estimates.

    argv: list of arguments excluding the program name; None means
    ``sys.argv[1:]``. With no input filenames, text is read from stdin.
    """
    optparser = optparse.OptionParser(
        usage="%prog [options] INPUT_FILENAME*")
    optparser.add_option('-n', '--n', dest='n', type='int', default=2,
                         help='size of n-grams')
    optparser.add_option('-o', '--output', dest='output_filename',
                         type='str', default="-", help='output file')
    (opts, args) = optparser.parse_args(argv)

    if opts.n < 1:
        optparser.error("size of n-grams must be at least 1")

    try:
        ngrams = count_ngrams(fileinput.input(args), opts.n)
    finally:
        # Release fileinput's module-level state even if reading fails.
        fileinput.close()

    if opts.output_filename == "-":
        # stdout is borrowed, not owned: flush it but never close it.
        write_probabilities(ngrams, sys.stdout)
        sys.stdout.flush()
    else:
        with open(opts.output_filename, "w") as output_file:
            write_probabilities(ngrams, output_file)


if __name__ == "__main__":
    main()