# Prepare the SpamAssassin public corpus for Mahout Bayes classification.
# Requires MAHOUT_HOME to be set; abort rather than download into the wrong dir.
cd "$MAHOUT_HOME" || exit 1
mkdir -p corpus/spam-assassin
curl -O \
  http://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2
curl -O \
  http://spamassassin.apache.org/publiccorpus/20021010_easy_ham.tar.bz2
tar xjf 20021010_spam.tar.bz2
tar xjf 20021010_easy_ham.tar.bz2
# Sanity check: count extracted messages per class (spam/ and easy_ham/
# are created by the tarballs above).
ls -1 spam/* | wc -l
ls -1 easy_ham/* | wc -l
# One subdirectory per class, mirroring the layout prepare20newsgroups expects.
mkdir -p train/easy_ham train/spam
mkdir -p test/easy_ham test/spam
# Split spam and ham into training and test sets.
# (The original had this note as a bare, uncommented prose line, which bash
# would try to execute as a command — converted to a comment.)
# cp "$file" train/"$file" works because $file keeps its "spam/" or
# "easy_ham/" prefix, which matches the train/<class>/ and test/<class>/
# directories created earlier: first 400 messages train, last 100 test.
ls -1 spam/* | head -n 400 \
  | while read -r file; do cp -- "$file" train/"$file"; done
ls -1 spam/* | tail -n 100 \
  | while read -r file; do cp -- "$file" test/"$file"; done
ls -1 easy_ham/* | head -n 400 \
  | while read -r file; do cp -- "$file" train/"$file"; done
ls -1 easy_ham/* | tail -n 100 \
  | while read -r file; do cp -- "$file" test/"$file"; done
export HADOOP_HOME=/usr/lib/hadoop
# Convert the raw mail directories into Mahout's input format.
# (This note was a bare, uncommented prose line in the original — converted.)
"$MAHOUT_HOME"/bin/mahout prepare20newsgroups \
  -p train/ \
  -o train_mahout/ \
  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
  -c UTF-8
"$MAHOUT_HOME"/bin/mahout prepare20newsgroups \
  -p test/ \
  -o test_mahout/ \
  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
  -c UTF-8
# Upload both prepared directories into the HDFS working directory.
# The original read "hadoop fs -put train_mahout test_mahout", which copies
# local train_mahout TO the HDFS path test_mahout (the last argument is the
# destination) — leaving no train_mahout in HDFS at all, even though the
# train/test commands below read both paths with "-source hdfs". With a
# trailing "." both directories are uploaded under their own names.
hadoop fs -put train_mahout test_mahout .
# Train a complementary naive Bayes (cbayes) model from the prepared
# training data. (This note was a bare, uncommented prose line — converted.)
"$MAHOUT_HOME"/bin/mahout trainclassifier \
  -i train_mahout \
  -o model \
  -type cbayes \
  -ng 1 \
  -source hdfs
# Evaluate the model against the held-out test set as a MapReduce job.
"$MAHOUT_HOME"/bin/mahout testclassifier \
  -d test_mahout \
  -m model \
  -type cbayes \
  -ng 1 \
  -source hdfs \
  -method mapreduce
# NOTE(review): from here the transcript repeats the corpus setup verbatim
# (a second run of the same walkthrough). Kept, with the same fixes applied.
mkdir -p corpus/spam-assassin
curl -O \
  http://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2
curl -O \
  http://spamassassin.apache.org/publiccorpus/20021010_easy_ham.tar.bz2
tar xjf 20021010_spam.tar.bz2
tar xjf 20021010_easy_ham.tar.bz2
# Sanity check: count extracted messages per class.
ls -1 spam/* | wc -l
ls -1 easy_ham/* | wc -l
# One subdirectory per class, mirroring the layout prepare20newsgroups expects.
mkdir -p train/easy_ham train/spam
mkdir -p test/easy_ham test/spam
# Split spam and ham into training and test sets: first 400 messages train,
# last 100 test. (Was a bare, uncommented prose line — converted to a comment.)
# cp "$file" train/"$file" works because $file keeps its "spam/" or
# "easy_ham/" prefix, matching the train/<class>/ and test/<class>/ dirs.
ls -1 spam/* | head -n 400 \
  | while read -r file; do cp -- "$file" train/"$file"; done
ls -1 spam/* | tail -n 100 \
  | while read -r file; do cp -- "$file" test/"$file"; done
ls -1 easy_ham/* | head -n 400 \
  | while read -r file; do cp -- "$file" train/"$file"; done
ls -1 easy_ham/* | tail -n 100 \
  | while read -r file; do cp -- "$file" test/"$file"; done
export HADOOP_HOME=/usr/lib/hadoop
# Convert the raw mail directories into Mahout's input format.
# (This note was a bare, uncommented prose line in the original — converted.)
"$MAHOUT_HOME"/bin/mahout prepare20newsgroups \
  -p train/ \
  -o train_mahout/ \
  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
  -c UTF-8
"$MAHOUT_HOME"/bin/mahout prepare20newsgroups \
  -p test/ \
  -o test_mahout/ \
  -a org.apache.mahout.vectorizer.DefaultAnalyzer \
  -c UTF-8
# Upload both prepared directories into the HDFS working directory.
# Without the trailing ".", "hadoop fs -put train_mahout test_mahout" treats
# test_mahout as the DESTINATION path, so train_mahout never lands in HDFS
# under its own name — yet the commands below read both with "-source hdfs".
hadoop fs -put train_mahout test_mahout .
# Train a complementary naive Bayes (cbayes) model from the prepared
# training data. (This note was a bare, uncommented prose line — converted.)
"$MAHOUT_HOME"/bin/mahout trainclassifier \
  -i train_mahout \
  -o model \
  -type cbayes \
  -ng 1 \
  -source hdfs
# Evaluate the model against the held-out test set as a MapReduce job.
"$MAHOUT_HOME"/bin/mahout testclassifier \
  -d test_mahout \
  -m model \
  -type cbayes \
  -ng 1 \
  -source hdfs \
  -method mapreduce
# Re-run training and evaluation.
"$MAHOUT_HOME"/bin/mahout trainclassifier \
  -i train_mahout \
  -o model \
  -type cbayes \
  -ng 1 \
  -source hdfs
# NOTE(review): the original tested "-m model1", but no trainclassifier run
# in this transcript ever writes "model1" (training above outputs "model").
# Assumed a typo and aligned the test to the model actually produced.
"$MAHOUT_HOME"/bin/mahout testclassifier \
  -d test_mahout \
  -m model \
  -type cbayes \
  -ng 1 \
  -source hdfs \
  -method mapreduce
# NOTE(review): the original dropped the "$MAHOUT_HOME/bin/mahout
# trainclassifier \" line here, leaving bare option lines ("-i train_mahout"
# etc.) that bash would try to execute as commands. Restored the invocation
# to match the identical training runs above.
"$MAHOUT_HOME"/bin/mahout trainclassifier \
  -i train_mahout \
  -o model \
  -type cbayes \
  -ng 1 \
  -source hdfs
# NOTE(review): "-m model1" again references a model no command produces;
# aligned to "model", the output of the trainclassifier run above.
"$MAHOUT_HOME"/bin/mahout testclassifier \
  -d test_mahout \
  -m model \
  -type cbayes \
  -ng 1 \
  -source hdfs \
  -method mapreduce