/* bayes spam */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <sqlite.h>

/**********************************************************

  bayes.C - Craig Morrison, craig@2cah.com

  A C implementation of Paul Graham's "A Plan for Spam"
  techniques. For more information see:

      http://www.paulgraham.com/spam.html

  This is my donation to the internet community at large.

  I hereby place this code into the public domain, you
  are free to use, fold, spindle and mutilate at your
  leisure and risk.

  I do this in the hopes that others will help take a
  proactive stance on spam and write their own Bayesian
  filters.

  These are the results of my investigations, it is my
  sincere hope that it is of use and saves you some small
  measure of time researching this material.

  If you see anything here you don't like, rewrite it
  and pass the code along.

  This project has a home at Source Forge:

      http://sourceforge.net/projects/bayesiancfilter

  I am only going to place one restriction on this
  project; Do NOT use GPL'd libraries to add features.
  I want this filter to be free from ANY licensing
  restrictions.

**********************************************************/

// Classic two-argument max/min helpers. Each argument may be evaluated
// twice, so do not pass expressions with side effects.
#ifndef max
#define max(a,b) (((a)>(b))?(a):(b))
#endif
#ifndef min
#define min(a,b) (((a)<(b))?(a):(b))
#endif

// Maximum number of probabilities we'll use for classification
#define PA_COUNT (20)
// Combined probability of spam threshold as a percentage
// Default here is 90%
#define SPAM_THRESHOLD (90)
// Token separator list: space, punctuation, the double quote and the
// CR / LF / TAB control characters.
#define TOK_SEP_LIST " ,?-+*()[]{}<>;^\"\r\n\t"
// HTML comment delimiters; text between them is discarded by the parsers
#define START_HTML_COMMENT "<!--"
#define STOP_HTML_COMMENT "-->"
// ----------------------------------------
// SQL Table names
#define TABLE_SPAM "Spam"            // per-word counts from spam messages
#define TABLE_HAM "Ham"              // per-word counts from non-spam messages
#define TABLE_HASH "Hash"            // per-word spam probabilities
#define TABLE_WORK "More"            // temporary per-message work table
#define TABLE_WORDS "Smoke"          // temporary merged word list
#define TABLE_COUNT "mCount"         // message counters (rows nGood / nBad)
#define TABLE_WHITELIST "WhiteList"
// SQL Field names
#define FIELD_WORD "Word"
#define FIELD_COUNT "Count"
// Common SQL statements.
// %q is the quote-escaping conversion of sqlite_exec_printf();
// GET_ROW is expanded with plain sprintf-style %s, so the caller must
// escape the word itself.
#define CREATE_TABLE "CREATE TABLE %s (Count FLOAT, Word VARCHAR(256) UNIQUE);"
#define CREATE_COUNT_TABLE "CREATE TABLE '%q' (Count INTEGER, Word VARCHAR(256) UNIQUE);"
#define CREATE_WORK_TABLE "CREATE TEMPORARY TABLE More (Count FLOAT, Word VARCHAR(256) UNIQUE);"
#define UPDATE_ROW "INSERT OR REPLACE INTO '%q' VALUES('%f','%q');"
#define GET_ROW "SELECT Count FROM %s WHERE Word='%s';"
#define SET_PRAGMA "PRAGMA default_synchronous = OFF;"

// forward references
double mapTokenP(double g, double nGood, double b, double nBad);
double compoundP(int count, double *p);

/**********************************************************

 OpenDB - open the SQLite database file named by s and
 make sure the filter's tables exist.

 The PRAGMA and CREATE TABLE statements are issued on a
 best-effort basis: their results are ignored, except
 that the nGood / nBad counter rows are inserted only
 when the CREATE TABLE for the counter table reports
 SQLITE_OK.

 Returns the handle from sqlite_open(), which is NULL
 when the database could not be opened.

**********************************************************/
sqlite *OpenDB(char *s)
{
 static const char *const wordTables[] = { TABLE_SPAM, TABLE_HAM, TABLE_WHITELIST };
 sqlite *handle;
 int k;

 handle = sqlite_open(s, 0, NULL);
 if (!handle)
  return(handle);

 sqlite_exec(handle, SET_PRAGMA, 0, 0, 0);

 // word tables: Spam, Ham, WhiteList (in that order)
 for (k = 0; k < 3; k++)
  sqlite_exec_printf(handle, CREATE_TABLE, 0, 0, 0, wordTables[k], NULL);

 // counter table: seed both counters with 0 when the table was created
 if (sqlite_exec_printf(handle, CREATE_TABLE, 0, 0, 0, TABLE_COUNT, NULL)!=SQLITE_OK)
  return(handle);

 sqlite_exec_printf(handle, "INSERT INTO '%q' VALUES(0, 'nGood');", 0, 0, 0, TABLE_COUNT, NULL);
 sqlite_exec_printf(handle, "INSERT INTO '%q' VALUES(0, 'nBad');", 0, 0, 0, TABLE_COUNT, NULL);

 return(handle);
}

/**********************************************************

 CloseDB - close a database handle obtained from OpenDB.
 A NULL handle is ignored.

**********************************************************/
void CloseDB(sqlite *db)
{
 if (!db)
  return;

 sqlite_close(db);
}

/**********************************************************

 updateHash - store value f for word in table tName,
 replacing any row already present for that word
 (INSERT OR REPLACE).

 The statement is re-issued for as long as SQLite
 answers SQLITE_BUSY.

 Returns the final SQLite result code.

**********************************************************/
int updateHash(char *tName, double f, char *word, sqlite *db)
{
 int status;

 for (;;) {
  status = sqlite_exec_printf(db, UPDATE_ROW, 0, 0, 0, tName, f, word, NULL);
  if (status != SQLITE_BUSY)
   break;
 }

 return(status);
}

/**********************************************************

 getHash - fetch the Count value stored for word in
 table tName.

 word   token to look up; only its first 256 characters
        are used
 tName  table to query
 db     open database handle

 Returns the stored value, or 0.0 when the word is not
 present, the stored value is NULL, the query could not
 be built or compiled, or any argument is NULL.

**********************************************************/
double getHash(char *word, char *tName, sqlite *db)
{
 sqlite_vm *pVm;
 // worst case every one of the 256 copied characters is a quote and
 // gets doubled, plus the terminating NUL
 char escWord[2*256+1], buf[16384], *pzTail, **pazValue, **pazColName, *s, *d;
 double hash=0.0;
 int i, n, sqlErr, columns;

 if (!word || !tName || !db)
  return(hash);

 // escape all ' chars to form proper SQL statement
 d = escWord;
 for (s = word, i = 0; *s && i < 256; s++, i++) {
  if (*s=='\'')
   *d++ = '\'';
  *d++ = *s;
 }
 *d = 0;

 // refuse to run a truncated statement
 n = snprintf(buf, sizeof(buf), GET_ROW, tName, escWord);
 if (n < 0 || (size_t)n >= sizeof(buf))
  return(hash);

 if (sqlite_compile(db, buf, &pzTail, &pVm, NULL)==SQLITE_OK) {

  // only the first row matters; keep retrying while the database is busy
  do {
   sqlErr = sqlite_step(pVm, &columns, &pazValue, &pazColName);
  } while (sqlErr==SQLITE_BUSY);

  if (sqlErr==SQLITE_ROW && columns && pazValue[0])
   hash = strtod(pazValue[0], NULL);

  sqlite_finalize(pVm, NULL);
 }

 return(hash);
}

/**********************************************************

 These are the email headers we want to parse.

 goodHeaders returns 1 when the line s begins with one
 of the wanted header names, compared without regard to
 case, and 0 otherwise (including for a NULL or empty
 line). Only the leading characters are compared, so a
 longer header name that starts with "received" or
 "subject" is accepted as well.

**********************************************************/

// Case-insensitive "does line start with name" test. Stops at the first
// mismatch, so a line shorter than name is never read past its NUL.
static int headerStartsWith(const char *line, const char *name)
{
 for (; *name; line++, name++) {
  if (tolower((unsigned char)*line) != tolower((unsigned char)*name))
   return 0;
 }

 return 1;
}

int goodHeaders(char *s)
{
 static const char *const wanted[] = { "received", "subject" };
 size_t k;

 if (!s)
  return 0;

 for (k = 0; k < sizeof(wanted)/sizeof(wanted[0]); k++) {
  if (headerStartsWith(s, wanted[k]))
   return 1;
 }

 return 0;
}

/**********************************************************

 createProbabilities rebuilds the Hash table.

 The Hash table is dropped and created again, then every
 row of the Smoke word list is visited. For each word the
 Spam count and twice the Ham count are looked up; when
 their sum is greater than 5 the pair is handed to
 mapTokenP together with the message totals and the
 result is stored in Hash under that word.

 db      open database handle
 bCount  number of spam messages
 gCount  number of non-spam messages

**********************************************************/
void createProbabilities(sqlite *db, int bCount, int gCount)
{
 sqlite_vm *stmt;
 char *tail, **row, **colNames, *word;
 double p=0.0, spamHits, hamHits;
 int status, nCols;

 sqlite_exec(db, "DROP TABLE Hash", 0, 0, 0);
 sqlite_exec(db, "CREATE TABLE Hash (Count FLOAT, Word VARCHAR(256) UNIQUE);", 0, 0, 0);

 if (sqlite_compile(db, "SELECT * FROM Smoke;", &tail, &stmt, NULL)!=SQLITE_OK)
  return;

 for (;;) {
  status = sqlite_step(stmt, &nCols, &row, &colNames);

  // busy: try the same step again; anything but a row ends the scan
  if (status==SQLITE_BUSY)
   continue;
  if (status!=SQLITE_ROW)
   break;
  if (!nCols)
   continue;

  word = row[0];
  spamHits = getHash(word, TABLE_SPAM, db);
  hamHits = getHash(word, TABLE_HAM, db)*2;

  // words without enough occurrences get no Hash entry
  if (hamHits+spamHits>5) {
   p = mapTokenP(hamHits, gCount, spamHits, bCount);

   // fallback for a (near) zero result
   if (p<0.0001) {
    if (hamHits<spamHits)
     p = 0.99;
    else
     p = 0.10;
   }

   updateHash(TABLE_HASH, p, word, db);
  }
 }

 sqlite_finalize(stmt, NULL);
}

/**********************************************************

 orderProbabilties reads the words of the More table in
 descending order of their Count field and returns the
 spam probabilities of the first PA_COUNT of them.

 The More table is filled by parseMessage with each
 word's distance from the neutral probability 0.5, so
 the words read first are the most 'interesting' ones of
 the message.

 A word that has no entry in the Hash table (getHash
 yields 0.0) is given the probability 0.4. That matches
 the distance of 0.1 parseMessage stores for such words
 and keeps a single unknown word from forcing the
 combined probability to zero.

 db     open database handle
 count  receives the number of probabilities stored

 Returns a zero-initialised array of PA_COUNT doubles of
 which the first *count are valid, or NULL when the
 allocation failed (*count is 0 then). The caller owns
 the array and must free() it.

**********************************************************/
double *orderProbabilties(sqlite *db, int *count)
{
 sqlite_vm *pVm;
 char *pzTail, **pazValue, **pazColName;
 double p, *pA;
 int sqlErr, columns, i=0;

 pA = calloc(PA_COUNT, sizeof *pA);
 if (pA && sqlite_compile(db, "SELECT Word FROM More ORDER BY Count DESC;", &pzTail, &pVm, NULL)==SQLITE_OK) {

  while (i<PA_COUNT) {
   sqlErr = sqlite_step(pVm, &columns, &pazValue, &pazColName);

   if (sqlErr==SQLITE_BUSY)
    continue;
   if (sqlErr!=SQLITE_ROW)
    break;
   if (!columns || !pazValue[0])
    continue;

   p = getHash(pazValue[0], TABLE_HASH, db);
   // 0.0 means "word not in the Hash table": use the unknown-word value
   pA[i++] = (p>0.0) ? p : 0.4;
  }

  sqlite_finalize(pVm, NULL);
 }

 *count = i;

 return(pA);
}

/**********************************************************

 Parses a message file to train the filter.

**********************************************************/
int parseFile(char *filename, char *tName, sqlite *db)
{
 FILE *in;
 char *t, *t1, buf[2048];
 double hash;
 int inHeaders=1, i=0, rc=0, inHTMLComment=0;

 if (in = fopen(filename, "r")) {

  while (fgets(buf, sizeof(buf), in)) {

   if (inHeaders) {
    t = strchr(buf, '/r');
    if (t) *t = 0;
    t = strchr(buf, '/n');
    if (t) *t = 0;
    if (!(buf[0])) {
     inHeaders = 0;
     continue;
    }

    if (!goodHeaders(buf))
     continue;
   }

   // toss away everything between <!-- and -->
   while(strstr(buf, START_HTML_COMMENT)) {
    if (inHTMLComment) {
     if (t = strstr(buf, STOP_HTML_COMMENT)) {
      strcpy(buf, &t[strlen(STOP_HTML_COMMENT)]);
      inHTMLComment = 0;
     }
     else {
      buf[0] = 0;
      break;
     }
    }
    else if (t = strstr(buf, START_HTML_COMMENT)) {
     if (t1 = strstr(t, STOP_HTML_COMMENT)) {
      strcpy(t, &t1[strlen(STOP_HTML_COMMENT)]);
     }
     else {
      *t = 0;
      inHTMLComment=1;
      break;
     }
    }
   }

   t = strtok(buf, TOK_SEP_LIST);
   while(t) {

    // strip off trailing '.'
    i = strlen(t);
    if ( (i) && (t[i-1]=='.')) {
     if (t[i-1]=='.')
      t[i-1] = 0;
    }

    if (*t) {
     hash = getHash(t, tName, db);
     updateHash(tName, hash+1, t, db);
    }

    t = strtok(NULL, TOK_SEP_LIST);
   }
  }
  rc = 1;
  fclose(in);
 }
 else
  fprintf(stderr, "/nError opening %s.../n", filename);

 return(rc);
}

/**********************************************************

 Parses a message file to classify a message.

**********************************************************/
int parseMessage(char *filename, sqlite *db)
{
 FILE *in;
 char *t, *t1, buf[2048];
 double hash, *pA, prob;
 int count=0, inHeaders=1, i=0, rc=0, addHash, inHTMLComment=0;

 if (in = fopen(filename, "r")) {
  sqlite_exec(db, CREATE_WORK_TABLE, 0, 0, 0);

  while (fgets(buf, sizeof(buf), in)) {

   if (inHeaders) {
    t = strchr(buf, '/r');
    if (t) *t = 0;
    t = strchr(buf, '/n');
    if (t) *t = 0;
    if (!buf[0]) {
     inHeaders = 0;
     continue;
    }

    if (!goodHeaders(buf))
     continue;
   }

   // toss away everything between <!-- and -->
   while(strstr(buf, START_HTML_COMMENT)) {
    if (inHTMLComment) {
     if (t = strstr(buf, STOP_HTML_COMMENT)) {
      strcpy(buf, &t[strlen(STOP_HTML_COMMENT)]);
      inHTMLComment = 0;
     }
     else {
      buf[0] = 0;
      break;
     }
    }
    else if (t = strstr(buf, START_HTML_COMMENT)) {
     if (t1 = strstr(t, STOP_HTML_COMMENT)) {
      strcpy(t, &t1[strlen(STOP_HTML_COMMENT)]);
     }
     else {
      *t = 0;
      inHTMLComment=1;
      break;
     }
    }
   }

   // parse out the tokens in buf
   t = strtok(buf, TOK_SEP_LIST);
   while(t) {

    addHash = 0;
    // strip off trailing '.'
    i = strlen(t);
    if ( (i) && (t[i-1]=='.')) {
     if (t[i-1]=='.')
      t[i-1] = 0;
    }

    if (*t) {
     hash = getHash(t, TABLE_HASH, db);
     // this is where we figure the difference from the
     // median so we can get 'interesting' words
     if (hash>0.5)
      hash = hash - 0.5;
     else
      hash = 0.5 - hash;

     if (hash == 0.5) {
      hash = 0.1;
     }

     updateHash(TABLE_WORK, hash, t, db);
    }

    t = strtok(NULL, TOK_SEP_LIST);
   }
  }

  pA = orderProbabilties(db, &count);
  if (pA) {
   prob = compoundP(count, pA);
   printf("pA = %f/n", prob);
   if (prob*100>SPAM_THRESHOLD)
    rc = 1;
  }
  fclose(in);
 }
 else
  fprintf(stderr, "/nError opening %s.../n", filename);

 return(rc);
}

/**********************************************************

  compoundP - combine individual spam probabilities.

  Calculates, over the first count entries of p:

          p1 p2 ... pn
  -------------------------------------
  p1 p2 ... pn + (1-p1) (1-p2) ... (1-pn)

  which for two values a and b is

        ab
  ----------------
  ab + (1-a) (1-b)

  count  number of entries of p to combine
  p      array of probabilities

  Returns the combined probability. With nothing to
  combine (count <= 0 or p is NULL) the result is 0.0.
  When numerator and denominator are both zero, which
  needs a 0 and a 1 among the inputs, the neutral value
  0.5 is returned instead of NaN.

**********************************************************/
double compoundP(int count, double *p)
{
 double n = 1.0, inv = 1.0, denom;
 int i;

 if (count<=0 || !p)
  return(0.0);

 for (i=0;i<count;i++) {
  n = n * p[i];
  inv = inv * (1-p[i]);
 }

 denom = n + inv;
 if (denom==0.0)
  return(0.5);

 return(n / denom);
}

/**********************************************************

  What mapTokenP does is:

  "Next I create a third hash table, this time mapping
   each token to the probability that an email containing
   it is a spam, which I calculate as follows [1]:

  (let ((g (* 2 (or (gethash word good) 0)))
     (b (or (gethash word bad) 0)))
  (unless (< (+ g b) 5)
  (max .01
   (min .99 (float (/ (min 1 (/ b nbad))
    (+ (min 1 (/ g ngood))
       (min 1 (/ b nbad)))))))))

   where word is the token whose probability we're
   calculating, good and bad are the hash tables I created
   in the first step, and ngood and nbad are the number of
   nonspam and spam messages respectively."

  This function is only the arithmetic of the above:

                  min(1, b/nBad)
  p = ---------------------------------
      min(1, g/nGood) + min(1, b/nBad)

  limited to the range .01 to .99. The doubling of the
  good count and the minimum-occurrence test belong to
  the caller, so g must already be the doubled count.

  g      doubled number of occurrences in good messages
  nGood  number of good messages
  b      number of occurrences in spam messages
  nBad   number of spam messages

  When a message total is zero its ratio is taken as 1
  if the word count is positive and 0 otherwise. When
  both ratios are zero there is no evidence either way
  and the neutral value 0.5 is returned.

**********************************************************/

// count/total limited to at most 1, defined for a zero total as well.
static double cappedRatio(double count, double total)
{
 double r;

 if (total<=0.0)
  return (count>0.0) ? 1.0 : 0.0;

 r = count / total;
 return (r<1.0) ? r : 1.0;
}

double mapTokenP(double g, double nGood, double b, double nBad)
{
 double good = cappedRatio(g, nGood);
 double bad = cappedRatio(b, nBad);
 double p;

 if (good+bad<=0.0)
  return(0.5);

 p = bad / (good + bad);
 if (p>0.99)
  p = 0.99;
 if (p<0.01)
  p = 0.01;

 return(p);
}

// d:/bayes/gmsg.eml
// -g d:/bayes/g*.eml -b d:/bayes/b*.eml
// Train
// bayes -g *good.eml -b *bad.eml
// Classify
// bayes message.eml
int main(int argc, char **argv)
{
 int nGood, nBad, i, procBad, newGood=0, newBad=0, msgs=0, rc=0;
 sqlite *db;
 time_t start, mStart, finish, minutes;

 nGood = 0;
 nBad = 0;

 if (argc==3) {

  db = OpenDB(argv[1]);
  if (!db) {
   fprintf(stderr, "ERROR: couldn't open database %s.../n", argv[1]);
   return(128);
  }

  printf("Classifying %s.../n", argv[2]);
  start = time(NULL);
  if (rc = parseMessage(argv[2], db))
   printf("%s should be marked as spam./n", argv[2]);
  else
   printf("%s should NOT be marked as spam./n", argv[2]);
  printf("%u seconds to complete../n", time(NULL)-start);
 }
 else if (argc>3) {


  db = OpenDB(argv[1]);

  start = time(NULL);
  nGood = (int)getHash("nGood", TABLE_COUNT, db);
  nBad = (int)getHash("nBad", TABLE_COUNT, db);

  if ((stricmp(argv[2], "-g")) && (stricmp(argv[2], "-b")))
   printf("-g or -b MUST be the first argument!/n");
  else {
   for (i=2;i<argc;i++) {
    if (!stricmp(argv[i], "-g"))
     procBad = 0;
    else if (!stricmp(argv[i], "-b"))
     procBad = 1;
    else {
     printf("Parsing %s messsage: %s...", procBad ? "Spam" : "Ham", argv[i]);
     mStart = time(NULL);
     if (procBad) {
      if (parseFile(argv[i], TABLE_SPAM, db)) {
       nBad++;
       msgs++;
       newBad++;
      }
     }
     else {
      if (parseFile(argv[i], TABLE_HAM, db)) {
       nGood++;
       msgs++;
       newGood++;
      }
     }
     printf("(%u)/n", time(NULL)-mStart);
    }
   }
  }

  printf("Saving message counts (%d Good, %d Bad).../n", nGood, nBad);
  updateHash(TABLE_COUNT, nBad, "nBad", db);
  updateHash(TABLE_COUNT, nGood, "nGood", db);
  printf("Creating temporary word table.../n");
  sqlite_exec(db, "CREATE TEMPORARY TABLE Smoke (Word VARCHAR(256));", 0, 0, 0);
  printf("Populating temporary word table.../n");
  sqlite_exec(db, "INSERT INTO Smoke SELECT Word FROM Spam UNION SELECT Word FROM Ham;", 0, 0, 0);

  printf("Creating probability hash table.../n");
  createProbabilities(db, nBad, nGood);

  printf("Cleaning up SQL database.../n");
  sqlite_exec(db, "VACUUM;", 0, 0, 0);

  finish = time(NULL);
  minutes=finish-start;
  printf("Processing of %u messages took %u seconds (%um) to complete./n", msgs, minutes, minutes/60);
  rc=0;
 }
 else {
  printf("Usage:/n/n");
  printf("To train:    bayes d:/bayes/bayes.db -g *good.msg -b *bad.msg/n");
  printf("To classify: bayes d:/bayes/bayes.db messsage.msg/n");
 }

 if (db) CloseDB(db);

 return(rc);
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值