#!/usr/bin/env python

"""
Version 0.9
Requires bogofilter 0.94.4 or later.

Example:
  bogominitrain.py -fnv -b "-o 0.95,0.35" hamfile spamfile

Notes:

    This is a python rewrite of the original perl bogominitrain.pl that is
    included with bogofilter. I rewrote it so I could debug it. The original
    had a problem with my spamfile of 25000 messages. It would just hang.

    This version also handles arguments a bit differently. It takes 1 hamfile
    and 1 spamfile instead of the shell expanded string. It also will
    automatically find your bogofilter database, so it isn't required on the
    command line. Though you can provide it if you want to.

    There are a couple other small changes. Just run with -h for a rundown.

    It still uses the "train on error" process just like the original, thus
    building minimal wordlists that can still correctly score all messages.
    This aspect of the script is unchanged from the perl version.

    It may be a good idea to run this script command several times or use the
    '-f' option to run the script until no scoring errors occur (training to
    exhaustion).
    
    Tips snipped from the original docs:

    To improve bogofilter's accuracy, use bogofilter's -o option to
    create a "security margin" around your normal cutoff during
    training.  The script will train so that the messages will avoid
    this interval, i.e., all messages in your training mboxes will be
    marked as ham or spam with values far from your production cutoff.
    For example you might want to use spam_cutoff=0.5 and '-o 0.8,0.2'
    as bogofilter options.  If you would rather use tri-state mode, you
    can just center this around 0.5 and again use '-o 0.8,0.2'.

    To correct the classification of a message, just move it to the correct
    mbox and repeat the full training process (which will add a few messages to
    the existing database).

"""

from optparse import OptionParser, make_option
import mailbox
import sys, os, re, time

option_list = [
    make_option("-b", "--bogofilter", dest="bogofilter",
        help="Passed through to bogofilter as arguments (put in quotes)."),
    make_option("-c", "--compact", action="store_true", dest="compact",
        default=False, help="Compacts the database at the end."),
    make_option("-d", "--database-directory", dest="database",
        metavar="DIRECTORY", help="Bogofilter database directory."),
    make_option("-f", "--force", action="store_true", dest="force",
        default=False, help="Runs the program until no errors remain."),
    make_option("-i", "--info", action="store_true", dest="info",
        default=False, help="Show extended documentation and exit."),
    make_option("-n", "--noreps", action="store_true", dest="noreps",
        default=False, help="Prevents messages from being added more than"
            "once. Recommended to use with -f to prevent errors left at end."),
    make_option("-s", "--save", action="store_true", dest="save",
        default=False, help="Saves the messages used for training to files"
            " bogominitrain.ham and bogominitrain.spam"),
    make_option("-v", "--verbose", action="count", dest="verbose",
        help="This switch produces info on messages used for training. "
            "Given twice also lists messages not used for training."),
    ]
# optparse only strips a leading "usage: " (with the colon) before adding its
# own "Usage: " prefix; without the colon the help text would read
# "Usage: usage <prog> ...".
usage = "usage: %prog [options] ham-mbox spam-mbox"

# The version banner is the second line of the module docstring.  Fall back
# to a placeholder when the docstring is unavailable (for instance when
# running under "python -OO") instead of crashing during start-up.
version = "Version unknown"
if __doc__:
    _doc_lines = __doc__.split('\n')
    if len(_doc_lines) > 1:
        version = _doc_lines[1]

parser = OptionParser(usage=usage, option_list=option_list, version=version)
(options, args) = parser.parse_args()

if options.info:
    print __doc__
    sys.exit(0)

if len(args) < 2:
    print "Not enough arguments."
    parser.print_help()
    sys.exit(1)
else:
    hambox_file, spambox_file = args


# Find database directory if not specified
if not options.database:
    # Start with the shortcut for the common case.
    default = os.path.expandvars("$HOME/.bogofilter")
    if os.path.exists(default):
        options.database = default
    # The BOGOFILTER_DIR environment variable, when present, replaces the
    # default location found above.
    if 'BOGOFILTER_DIR' in os.environ:
        options.database = os.environ['BOGOFILTER_DIR']
    # Still nothing?  Then look for a bogofilter_dir setting in the config
    # files (per-user file first, then the system-wide one).
    config_files = [os.path.expandvars("$HOME/.bogofilter.cf"),
            "/etc/bogofilter.cf"]
    # Matches "bogofilter_dir=<value>" outside of a '#' comment and captures
    # the value up to a trailing comment, if any.
    find_dir = re.compile(r"[^#]*\s*bogofilter_dir=([^#]+).*").match
    for config_file in config_files:
        if options.database: break
        if os.path.exists(config_file):
            config = open(config_file)
            try:
                for line in config:
                    dir_match = find_dir(line)
                    if dir_match:
                        options.database = dir_match.group(1).strip()
                        break
            finally:
                # Release the handle whether or not a setting was found.
                config.close()
    if options.database:
        # fully expand path ("~" first, then environment variables)
        db = os.path.expanduser(options.database)
        options.database = os.path.expandvars(db)
    else:
        print >> sys.stderr, (
                "Database directory could not be determined.\n"
                "Please use -d argument to specify it.\n"
                "See help (-h) for options")
        sys.exit(1)

# set some initial values
bogofilter = "bogofilter"
if options.bogofilter:
    bogofilter = "%s %s" % (bogofilter, options.bogofilter)
bogofilter = "%s -d %s" % (bogofilter, options.database)

wordlist = os.path.join(options.database, 'wordlist.db')
if os.path.exists(wordlist):
    print "Starting with database containing:"
    os.system("bogoutil -w %s .MSG_COUNT" % options.database)

if not (os.path.exists(wordlist) and os.path.getsize(wordlist)):
    os.system(bogofilter + " -n < /dev/null")

try:
    hambox = mailbox.UnixMailbox(file(hambox_file,'r'), lambda fp: fp)
    spambox = mailbox.UnixMailbox(file(spambox_file,'r'), lambda fp: fp)
except IOError, arg:
    print
    print 'Cannot open spam/ham file!'
    print arg.strerror, arg.filename
    sys.exit(1)

runs = 0
ham_total = sum(1 for _ in hambox)
spam_total = sum(1 for _ in spambox)
status_conv = ("spam", "ham", "unsure", "error" )
trainedham = dict.fromkeys(range(1,ham_total+1),0)
trainedspam = dict.fromkeys(range(1,spam_total+1),0)

def eof(mbox):
    """Tell whether mbox has no further message, without consuming one.

    The mailbox is asked for its next message as a probe; its read position
    (seekp) is put back afterwards so the probe has no lasting effect.
    Returns True when the probe yielded nothing, False otherwise.
    """
    saved_position = mbox.seekp
    upcoming = mbox.next()
    mbox.seekp = saved_position
    if upcoming:
        return False
    return True

def process(mbox, mtype, added, trained, count, ocount, total, ototal):
    """ mbox    = hambox, spambox
        mtype   = "spam","ham"
        added   = ham_added, spam_added
        trained = trainedspam, trainedham
        count   = ham_count, spam_count
        ocount  = opposite count
        total   = ham_total, spam_total
        ototal  = opposite total
    """
    if not (eof(mbox) or (count*ototal > ocount*total)):
        msg = mbox.next()
        if msg:
            msg = msg.read()
            count += 1
            pipe = popen(bogofilter,"w")
            pipe.write(msg)
            # bitshift to convert to exit status
            status = status_conv[(pipe.close() or 0) >> 8]
            if not status == mtype:
                if not (options.noreps and trained[count]):
                    trainer = mtype == "ham" and " -n" or " -s"
                    pipe = popen(bogofilter+trainer,"w")
                    pipe.write(msg)
                    pipe.close()
                    added += 1
                    trained[count] += 1
                    if options.verbose:
                        print status,
                        print "Training %s message %s" % (mtype, count),
                        if trained[count] > 1:
                            print "(%s)" % trained[count]
                        print
                    if options.save:
                        open("bogominitrain.%s" % mtype,"a").write(msg)
                elif options.verbose:
                    print status,
                    print "-- Skipping %s message %s" % (mtype, count)
            elif options.verbose:
                print status,
                print "-- Not training %s message %s" % (mtype, count)
            #
    return count, added

# reset mbox objects back to beginning
hambox.seekp = 0
spambox.seekp = 0

popen = os.popen
while True:
    starttime = time.time()
    runs += 1
    ham_added = spam_added = 0
    ham_count = spam_count = 0
    skip_ham = skip_spam = 0

    while True:
        ham_count, ham_added = process(
                hambox, "ham", ham_added, trainedham,
                ham_count, spam_count, ham_total, spam_total)
        spam_count, spam_added = process(
                spambox, "spam", spam_added, trainedspam,
                spam_count, ham_count, spam_total, ham_total)
        # 2-loop exit
        if eof(hambox) and eof(spambox):
            break
    
    hambox.seekp = 0
    spambox.seekp = 0

    print
    print "End of run #%d (in %.2fs):" % (runs ,(time.time() - starttime))
    print "Read %d ham and %d spam." % (ham_count, spam_count)
    print "Added %d ham (skipping %d) and" % (ham_added, skip_ham),
    print "%d spam (skipping %d) to the database" % (spam_added, skip_spam)
    os.system("bogoutil -w %s .MSG_COUNT" % options.database)

    false_negs = false_pos = 0
    if (ham_added + spam_added) != 0:
        starttime = time.time()
        false_negs = int(popen("cat %s | %s -TM | grep -cv \^S"
                % (spambox_file,bogofilter)).read())
        false_pos = int(popen("cat %s | %s -TM | grep -cv \^H"
                % (hambox_file,bogofilter)).read())
        print 
        print "False Negatives", false_negs
        print "False Positives", false_pos
        print "Verification done in %.2fs" % (time.time() - starttime)
        time.sleep(2)

    # main loop exit
    if (false_negs+false_pos)==0 or (ham_added+spam_added)==0 or \
            not options.force:
        break

if options.force:
    print
    print "%d run%s" % (runs ,(runs>1 and "s")),
    print "needed to close off."

if options.compact:
  print "Compacting database ..."
  os.system("bf_compact %s && rm -rf %s.old" % ((options.database,)*2))



