bayes spam

最新推荐文章于 2024-07-15 11:00:33 发布

zcg19

最新推荐文章于 2024-07-15 11:00:33 发布

阅读量446

点赞数

文章标签： sqlite table null html insert list

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <sqlite.h>

/**********************************************************

bayes.C - Craig Morrison, craig@2cah.com

A C implementation of Paul Graham's "A Plan for Spam"
techniques. For more information see:

http://www.paulgraham.com/spam.html

This is my donation to the internet community at large.

I hereby place this code into the public domain, you
are free to use, fold, spindle and mutilate at your
leisure and risk.

I do this in the hopes that others will help take a
proactive stance on spam and write their own Bayesian
filters.

These are the results of my investigations, it is my
sincere hope that it is of use and saves you some small
measure of time researching this material.

If you see anything here you don't like, rewrite it
and pass the code along.

This project has a home at Source Forge:

http://sourceforge.net/projects/bayesiancfilter

I am only going to place one restriction on this
project; Do NOT use GPL'd libraries to add features.
I want this filter to be free from ANY licensing
restrictions.

**********************************************************/

// Fallback definitions of max()/min(), used only when nothing earlier has
// already defined macros with these names.
// NOTE: these are plain function-like macros, so whichever argument is
// selected is evaluated twice - never pass expressions with side effects.
#ifndef max
#define max(a,b) (((a)>(b))?(a):(b))
#endif
#ifndef min
#define min(a,b) (((a)<(b))?(a):(b))
#endif

// Maximum number of probabilities we'll use for classification
#define PA_COUNT (20)
// Combined probability of spam threshold as a percentage
// Default here is 90%
#define SPAM_THRESHOLD (90)
// Token separator list: space, common punctuation, the double quote
// and the CR / LF / TAB control characters
#define TOK_SEP_LIST " ,?-+*()[]{}<>;^\"\r\n\t"
// HTML comment tokens (still need to be addressed)
#define START_HTML_COMMENT "<!--"
#define STOP_HTML_COMMENT "-->"
// ----------------------------------------
// SQL Table names
#define TABLE_SPAM "Spam"
#define TABLE_HAM "Ham"
#define TABLE_HASH "Hash"
#define TABLE_WORK "More"
#define TABLE_WORDS "Smoke"
#define TABLE_COUNT "mCount"
#define TABLE_WHITELIST "WhiteList"
// SQL Field names
#define FIELD_WORD "Word"
#define FIELD_COUNT "Count"
// Common SQL statements
#define CREATE_TABLE "CREATE TABLE %s (Count FLOAT, Word VARCHAR(256) UNIQUE);"
#define CREATE_COUNT_TABLE "CREATE TABLE '%q' (Count INTEGER, Word VARCHAR(256) UNIQUE);"
#define CREATE_WORK_TABLE "CREATE TEMPORARY TABLE More (Count FLOAT, Word VARCHAR(256) UNIQUE);"
#define UPDATE_ROW "INSERT OR REPLACE INTO '%q' VALUES('%f','%q');"
#define GET_ROW "SELECT Count FROM %s WHERE Word='%s';"
#define SET_PRAGMA "PRAGMA default_synchronous = OFF;"

// forward references
double mapTokenP(double g, double nGood, double b, double nBad);
double compoundP(int count, double *p);

/**********************************************************

Opens the SQLite database file named by s and issues the
statements that set up the filter's schema.

The PRAGMA and the CREATE TABLE statements are sent on
every open and their results are ignored, with one
exception: only when creating the message-count table
succeeds are its two rows ('nGood' and 'nBad') inserted
with a count of 0.

Returns the database handle, or NULL when sqlite_open()
did not return one.

**********************************************************/
sqlite *OpenDB(char *s)
{
    static const char *wordTables[] = { TABLE_SPAM, TABLE_HAM, TABLE_WHITELIST };
    sqlite *handle;
    int t;

    handle = sqlite_open(s, 0, NULL);
    if (handle == NULL)
        return(handle);

    sqlite_exec(handle, SET_PRAGMA, 0, 0, 0);

    // the three per-word tables share one layout
    for (t = 0; t < 3; t++)
        sqlite_exec_printf(handle, CREATE_TABLE, 0, 0, 0, wordTables[t], NULL);

    // seed the counters only when the count table was created just now
    if (sqlite_exec_printf(handle, CREATE_TABLE, 0, 0, 0, TABLE_COUNT, NULL) != SQLITE_OK)
        return(handle);

    sqlite_exec_printf(handle, "INSERT INTO '%q' VALUES(0, 'nGood');", 0, 0, 0, TABLE_COUNT, NULL);
    sqlite_exec_printf(handle, "INSERT INTO '%q' VALUES(0, 'nBad');", 0, 0, 0, TABLE_COUNT, NULL);

    return(handle);
}

/**********************************************************

Closes a database handle obtained from OpenDB().
A NULL handle is accepted and ignored.

**********************************************************/
void CloseDB(sqlite *db)
{
    if (db == NULL)
        return;

    sqlite_close(db);
}

/**********************************************************

Stores the value f for word in table tName using an
INSERT OR REPLACE statement, so an existing row for the
word is overwritten.

The statement is re-issued for as long as SQLite reports
SQLITE_BUSY. The first non-busy result code is returned
to the caller.

**********************************************************/
int updateHash(char *tName, double f, char *word, sqlite *db)
{
    int status;

    for (;;) {
        status = sqlite_exec_printf(db, UPDATE_ROW, 0, 0, 0, tName, f, word, NULL);
        if (status != SQLITE_BUSY)
            break;
    }

    return(status);
}

/**********************************************************

Grabs the hash value of a word from a hash table.

word  - token to look up; only its first 256 characters
        are used
tName - name of the table to query
db    - open database handle

Returns the Count column of the first matching row
converted to a double, or 0.0 when there is no such row,
the value is NULL, the query cannot be built or compiled,
or any argument is NULL.

**********************************************************/
double getHash(char *word, char *tName, sqlite *db)
{
    enum { MAX_WORD_CHARS = 256 };
    sqlite_vm *pVm;
    // sized for the worst case: every copied character is a quote that
    // gets doubled, plus the terminating NUL
    char escWord[2*MAX_WORD_CHARS + 1], buf[16384];
    char *pzTail, **pazValue, **pazColName, *s, *d;
    double hash=0.0;
    int copied=0, written, sqlErr, columns;

    if (!word || !tName || !db)
        return(hash);

    // escape all ' chars (by doubling them) to form a proper SQL statement
    d = escWord;
    for (s = word; *s && copied < MAX_WORD_CHARS; s++, copied++) {
        if (*s=='\'')
            *d++ = '\'';
        *d++ = *s;
    }
    *d = 0;

    written = snprintf(buf, sizeof(buf), GET_ROW, tName, escWord);
    if (written < 0 || (size_t)written >= sizeof(buf))
        return(hash);   // a truncated statement must not be executed

    if (sqlite_compile(db, buf, &pzTail, &pVm, NULL)==SQLITE_OK) {

        while ((sqlErr = sqlite_step(pVm, &columns, &pazValue, &pazColName))!=SQLITE_DONE) {

            if (sqlErr==SQLITE_BUSY)
                continue;       // database is locked, try the step again
            else if (sqlErr!=SQLITE_ROW)
                break;

            // a NULL column value is treated like a missing row
            if (columns && pazValue && pazValue[0])
                hash = atof(pazValue[0]);

            break;              // only the first row is of interest
        }

        sqlite_finalize(pVm, NULL);
    }

    return(hash);
}

/**********************************************************

Helper: returns 1 when line begins with name, comparing
without regard to ASCII case, and 0 otherwise. Written
with tolower() because strnicmp() is not part of standard
C and is not declared by <string.h> on every platform.

**********************************************************/
static int headerStartsWith(const char *line, const char *name)
{
    while (*name) {
        // a line shorter than name hits its NUL here and fails the compare
        if (tolower((unsigned char)*line) != tolower((unsigned char)*name))
            return 0;
        line++;
        name++;
    }

    return 1;
}

/**********************************************************

These are the email headers we want to parse.

Returns 1 when s starts with "received" or "subject"
(case-insensitive), 0 for anything else, including a
NULL pointer.

**********************************************************/
int goodHeaders(char *s)
{
    if (!s)
        return 0;

    if (headerStartsWith(s, "received"))
        return 1;
    if (headerStartsWith(s, "subject"))
        return 1;

    return 0;
}

/**********************************************************

createProbabilities rebuilds the Hash table from scratch.

The Hash table is dropped and re-created, then every row
of the Smoke word list is visited. For each word its count
is fetched from the Spam table and from the Ham table (the
Ham count is doubled). Words whose combined count is not
greater than 5 are skipped; for the rest mapTokenP()
supplies the probability, which is written to the Hash
table under that word.

bCount / gCount are the numbers of spam / ham messages
and are handed to mapTokenP() unchanged.

**********************************************************/
void createProbabilities(sqlite *db, int bCount, int gCount)
{
    sqlite_vm *stmt;
    char *tail, **row, **colNames;
    double spamHits, hamHits, p=0.0;
    int status, nCols;

    sqlite_exec(db, "DROP TABLE Hash", 0, 0, 0);
    sqlite_exec(db, "CREATE TABLE Hash (Count FLOAT, Word VARCHAR(256) UNIQUE);", 0, 0, 0);

    if (sqlite_compile(db, "SELECT * FROM Smoke;", &tail, &stmt, NULL)!=SQLITE_OK)
        return;

    for (;;) {
        status = sqlite_step(stmt, &nCols, &row, &colNames);
        if (status==SQLITE_DONE)
            break;
        if (status==SQLITE_BUSY)
            continue;           // locked: repeat the step
        if (status!=SQLITE_ROW)
            break;
        if (!nCols)
            continue;

        spamHits = getHash(row[0], TABLE_SPAM, db);
        hamHits = getHash(row[0], TABLE_HAM, db)*2;

        if (hamHits+spamHits>5) {
            p = mapTokenP(hamHits, gCount, spamHits, bCount);
            // fallback for a (near) zero result: lean on whichever
            // table saw the word more often
            if (p<0.0001)
                p = (hamHits<spamHits) ? 0.99 : 0.10;

            updateHash(TABLE_HASH, p, row[0], db);
        }
    }

    sqlite_finalize(stmt, NULL);
}

/**********************************************************

Collects the probabilities of the most 'interesting'
words of the message being classified.

The More table holds, per word, the distance of its
probability from the median of 0.5. The words are read
back ordered by that distance, largest first, and for
each one the probability stored in the Hash table is
looked up. At most PA_COUNT entries are gathered.

Returns a malloc'd array of PA_COUNT doubles (unused
slots are zero) or NULL when the allocation fails;
*count receives the number of slots that were filled.

**********************************************************/
double *orderProbabilties(sqlite *db, int *count)
{
    sqlite_vm *stmt;
    char *tail, **row, **colNames;
    double *probs;
    int status, nCols, filled=0;

    probs = malloc(PA_COUNT * sizeof(double));
    if (probs == NULL) {
        *count = filled;
        return(probs);
    }
    memset(probs, 0, sizeof(double)*PA_COUNT);

    if (sqlite_compile(db, "SELECT Word FROM More ORDER BY Count DESC;", &tail, &stmt, NULL)==SQLITE_OK) {

        for (;;) {
            status = sqlite_step(stmt, &nCols, &row, &colNames);
            if (status==SQLITE_DONE)
                break;
            if (status==SQLITE_BUSY)
                continue;       // locked: repeat the step
            if (status!=SQLITE_ROW)
                break;
            if (!nCols)
                continue;

            probs[filled++] = getHash(row[0], TABLE_HASH, db);
            if (filled>=PA_COUNT)
                break;          // array is full
        }

        sqlite_finalize(stmt, NULL);
    }

    *count = filled;

    return(probs);
}

/**********************************************************

Parses a message file to train the filter.

filename - message to read
tName    - table whose per-word counts are incremented
db       - open database handle

While in the header section only lines accepted by
goodHeaders() are used; the first empty line ends the
headers. Text between START_HTML_COMMENT and
STOP_HTML_COMMENT is discarded, also when a comment runs
over several lines. What is left is split on
TOK_SEP_LIST, one trailing '.' is removed from each
token, and the token's count in tName is raised by one.

Returns 1 when the file could be opened, 0 otherwise.

**********************************************************/
int parseFile(char *filename, char *tName, sqlite *db)
{
    FILE *in;
    char *t, *t1, buf[2048];
    double hash;
    size_t len;
    int inHeaders=1, rc=0, inHTMLComment=0;

    in = fopen(filename, "r");
    if (!in) {
        fprintf(stderr, "\nError opening %s...\n", filename);
        return(rc);
    }

    while (fgets(buf, sizeof(buf), in)) {

        if (inHeaders) {
            t = strchr(buf, '\r');
            if (t) *t = 0;
            t = strchr(buf, '\n');
            if (t) *t = 0;
            if (!(buf[0])) {
                inHeaders = 0;      // blank line: the body starts here
                continue;
            }

            if (!goodHeaders(buf))
                continue;
        }

        // toss away everything between <!-- and -->
        // The loop must also run while a comment opened on an earlier
        // line is still pending, even if this line has no start marker.
        while (inHTMLComment || strstr(buf, START_HTML_COMMENT)) {
            if (inHTMLComment) {
                t = strstr(buf, STOP_HTML_COMMENT);
                if (!t) {
                    buf[0] = 0;     // whole line lies inside the comment
                    break;
                }
                t += strlen(STOP_HTML_COMMENT);
                // source and destination overlap, so strcpy() is not allowed
                memmove(buf, t, strlen(t)+1);
                inHTMLComment = 0;
            }
            else {
                t = strstr(buf, START_HTML_COMMENT);
                t1 = strstr(t, STOP_HTML_COMMENT);
                if (t1) {
                    t1 += strlen(STOP_HTML_COMMENT);
                    memmove(t, t1, strlen(t1)+1);
                }
                else {
                    *t = 0;         // comment continues on a later line
                    inHTMLComment = 1;
                    break;
                }
            }
        }

        for (t = strtok(buf, TOK_SEP_LIST); t; t = strtok(NULL, TOK_SEP_LIST)) {

            // strip off trailing '.'
            len = strlen(t);
            if (len && t[len-1]=='.')
                t[len-1] = 0;

            if (*t) {
                hash = getHash(t, tName, db);
                updateHash(tName, hash+1, t, db);
            }
        }
    }

    rc = 1;
    fclose(in);

    return(rc);
}

/**********************************************************

Parses a message file to classify a message.

filename - message to read
db       - open database handle

Header lines are filtered through goodHeaders() until the
first empty line. Every token of the remaining text is
looked up in the Hash table and its distance from the
median of 0.5 is stored in the temporary More table (a
distance of exactly 0.5 is stored as 0.1). The
probabilities of the most interesting words are then
combined with compoundP() and the result is printed.

Returns 1 when the combined probability, as a percentage,
exceeds SPAM_THRESHOLD; 0 otherwise, including when the
file cannot be opened.

**********************************************************/
int parseMessage(char *filename, sqlite *db)
{
    FILE *in;
    char *t, buf[2048];
    double hash, *pA, prob;
    size_t len;
    int count=0, inHeaders=1, rc=0;

    in = fopen(filename, "r");
    if (!in) {
        fprintf(stderr, "\nError opening %s...\n", filename);
        return(rc);
    }

    sqlite_exec(db, CREATE_WORK_TABLE, 0, 0, 0);

    while (fgets(buf, sizeof(buf), in)) {

        if (inHeaders) {
            t = strchr(buf, '\r');
            if (t) *t = 0;
            t = strchr(buf, '\n');
            if (t) *t = 0;
            if (!buf[0]) {
                inHeaders = 0;      // blank line: the body starts here
                continue;
            }

            if (!goodHeaders(buf))
                continue;
        }

        // parse out the tokens in buf
        for (t = strtok(buf, TOK_SEP_LIST); t; t = strtok(NULL, TOK_SEP_LIST)) {

            // strip off trailing '.'
            len = strlen(t);
            if (len && t[len-1]=='.')
                t[len-1] = 0;

            if (*t) {
                hash = getHash(t, TABLE_HASH, db);
                // this is where we figure the difference from the
                // median so we can get 'interesting' words
                if (hash>0.5)
                    hash = hash - 0.5;
                else
                    hash = 0.5 - hash;

                if (hash == 0.5) {
                    hash = 0.1;
                }

                updateHash(TABLE_WORK, hash, t, db);
            }
        }
    }

    pA = orderProbabilties(db, &count);
    if (pA) {
        prob = compoundP(count, pA);
        printf("pA = %f\n", prob);
        if (prob*100>SPAM_THRESHOLD)
            rc = 1;
        free(pA);       // the array is allocated by orderProbabilties()
    }
    fclose(in);

    return(rc);
}

/**********************************************************

Combines the individual probabilities p[0..count-1].

Calculate:

          p0 * p1 * ... * pn
  ---------------------------------------------
  p0 * p1 * ... * pn + (1-p0)(1-p1) ... (1-pn)

which for two values a and b is

        ab
  ----------------
  ab + (1-a) (1-b)

Returns 0.0 when there is nothing to combine (count <= 0
or p is NULL) and 0.5 when the denominator is zero, which
happens when the list contains both 0.0 and 1.0.

**********************************************************/
double compoundP(int count, double *p)
{
    double n = 1.0, inv = 1.0, denom;
    int i;

    if (count <= 0 || !p)
        return(0.0);

    for (i = 0; i < count; i++) {
        n = n * p[i];
        inv = inv * (1 - p[i]);
    }

    denom = n + inv;
    if (denom == 0.0)
        return(0.5);    // contradictory evidence; avoid a 0/0 NaN

    return(n / denom);
}

/**********************************************************

What mapTokenP does is:

"Next I create a third hash table, this time mapping
 each token to the probability that an email containing
 it is a spam, which I calculate as follows [1]:

(let ((g (* 2 (or (gethash word good) 0)))
 (b (or (gethash word bad) 0)))
(unless (< (+ g b) 5)
 (max .01
 (min .99 (float (/ (min 1 (/ b nbad))
 (+ (min 1 (/ g ngood))
 (min 1 (/ b nbad)))))))))

   where word is the token whose probability we're
   calculating, good and bad are the hash tables I created
   in the first step, and ngood and nbad are the number of
   nonspam and spam messages respectively."

In reality here, as you can see, all we are dealing with
in this function is the portion of the above code that
does the math. The logic for the above will be held else
where.

g     - ham count of the token, ALREADY doubled by the
        caller (the let-binding above); it is not doubled
        again here
nGood - number of ham messages
b     - spam count of the token
nBad  - number of spam messages

Returns the spam probability of the token, clamped to
[0.01, 0.99]. When both ratios are zero there is no
evidence either way and 0.5 is returned.

**********************************************************/

// (min 1 (/ hits total)) with a defined result for total <= 0:
// any hits at all saturate the ratio, no hits give 0.
static double tokenRatio(double hits, double total)
{
    double r;

    if (total <= 0.0)
        return (hits > 0.0) ? 1.0 : 0.0;

    r = hits / total;
    return (r > 1.0) ? 1.0 : r;
}

double mapTokenP(double g, double nGood, double b, double nBad)
{
    double good = tokenRatio(g, nGood);
    double bad = tokenRatio(b, nBad);
    double sum = good + bad;    // NOT clamped to 1, see the formula above
    double p;

    if (sum <= 0.0)
        return(0.5);

    p = bad / sum;
    if (p > 0.99)
        p = 0.99;
    if (p < 0.01)
        p = 0.01;

    return(p);
}

// Case-insensitive (ASCII) equality of a command line argument and an
// option name; stands in for the non-standard stricmp().
static int optionIs(const char *arg, const char *opt)
{
    while (*arg && *opt) {
        if (tolower((unsigned char)*arg) != tolower((unsigned char)*opt))
            return 0;
        arg++;
        opt++;
    }

    return *arg == *opt;    // equal only if both ended together
}

// d:/bayes/gmsg.eml
// -g d:/bayes/g*.eml -b d:/bayes/b*.eml
// Train
// bayes -g *good.eml -b *bad.eml
// Classify
// bayes message.eml
//
// argv[1] is always the database file.
//   2 further arguments or more: training; "-g" / "-b" switch between
//     ham and spam for the file names that follow them.
//   exactly 1 further argument: classify that message.
// Exit status: 1 when the classified message is spam, 128 when the
// database cannot be opened, 0 otherwise.
int main(int argc, char **argv)
{
    int nGood = 0, nBad = 0, i, procBad = 0, msgs = 0, rc = 0;
    sqlite *db = NULL;      // stays NULL on the usage path
    time_t start, mStart;
    unsigned long seconds;

    if (argc==3) {

        db = OpenDB(argv[1]);
        if (!db) {
            fprintf(stderr, "ERROR: couldn't open database %s...\n", argv[1]);
            return(128);
        }

        printf("Classifying %s...\n", argv[2]);
        start = time(NULL);
        rc = parseMessage(argv[2], db);
        if (rc)
            printf("%s should be marked as spam.\n", argv[2]);
        else
            printf("%s should NOT be marked as spam.\n", argv[2]);
        // time_t has no printf conversion of its own, so convert explicitly
        printf("%lu seconds to complete..\n", (unsigned long)(time(NULL)-start));
    }
    else if (argc>3) {

        db = OpenDB(argv[1]);
        if (!db) {
            fprintf(stderr, "ERROR: couldn't open database %s...\n", argv[1]);
            return(128);
        }

        start = time(NULL);
        nGood = (int)getHash("nGood", TABLE_COUNT, db);
        nBad = (int)getHash("nBad", TABLE_COUNT, db);

        if (!optionIs(argv[2], "-g") && !optionIs(argv[2], "-b"))
            printf("-g or -b MUST be the first argument!\n");
        else {
            for (i=2;i<argc;i++) {
                if (optionIs(argv[i], "-g"))
                    procBad = 0;
                else if (optionIs(argv[i], "-b"))
                    procBad = 1;
                else {
                    printf("Parsing %s messsage: %s...", procBad ? "Spam" : "Ham", argv[i]);
                    mStart = time(NULL);
                    if (procBad) {
                        if (parseFile(argv[i], TABLE_SPAM, db)) {
                            nBad++;
                            msgs++;
                        }
                    }
                    else {
                        if (parseFile(argv[i], TABLE_HAM, db)) {
                            nGood++;
                            msgs++;
                        }
                    }
                    printf("(%lu)\n", (unsigned long)(time(NULL)-mStart));
                }
            }
        }

        printf("Saving message counts (%d Good, %d Bad)...\n", nGood, nBad);
        updateHash(TABLE_COUNT, nBad, "nBad", db);
        updateHash(TABLE_COUNT, nGood, "nGood", db);
        printf("Creating temporary word table...\n");
        sqlite_exec(db, "CREATE TEMPORARY TABLE Smoke (Word VARCHAR(256));", 0, 0, 0);
        printf("Populating temporary word table...\n");
        sqlite_exec(db, "INSERT INTO Smoke SELECT Word FROM Spam UNION SELECT Word FROM Ham;", 0, 0, 0);

        printf("Creating probability hash table...\n");
        createProbabilities(db, nBad, nGood);

        printf("Cleaning up SQL database...\n");
        sqlite_exec(db, "VACUUM;", 0, 0, 0);

        seconds = (unsigned long)(time(NULL)-start);
        printf("Processing of %d messages took %lu seconds (%lum) to complete.\n", msgs, seconds, seconds/60);
        rc=0;
    }
    else {
        printf("Usage:\n\n");
        printf("To train:    bayes d:/bayes/bayes.db -g *good.msg -b *bad.msg\n");
        printf("To classify: bayes d:/bayes/bayes.db messsage.msg\n");
    }

    CloseDB(db);    // accepts NULL

    return(rc);
}

zcg19

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
bayes spam

 #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <time.h> #include <sqlite.h> /********************************************************** bayes.C - Craig Morrison, craig@2cah.
复制链接

扫一扫