Simple Spam Classification with Spark -- Java Source Code

Code

The program reads one spam file and one normal ("ham") file from HDFS (each line is treated as a document), hashes each document's tokens into a 10,000-dimensional term-frequency vector with HashingTF, labels spam 1.0 and ham 0.0, trains a logistic-regression model with stochastic gradient descent on the union of the two sets, and finally classifies two held-out messages using the same feature mapping.
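Before the full listing, a minimal sketch of the hashing trick that HashingTF implements, assuming the same Spark 1.x MLlib API used below (the tokens are made-up examples and HashingDemo is not part of the original code):

import java.util.Arrays;

import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;

public final class HashingDemo {
    public static void main(String[] args) {
        // Each token is hashed to an index in [0, 10000); the resulting
        // sparse vector holds the term count at each index, so no
        // explicit vocabulary ever has to be built or stored.
        HashingTF tf = new HashingTF(10000);
        Vector v = tf.transform(Arrays.asList("cheap", "pills", "cheap"));
        System.out.println(v); // count 2.0 at the index for "cheap", 1.0 for "pills"
    }
}

The full program: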

package cn.cc.spark;

import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;



public final class Spam {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("SpamTraining").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // One spam (positive) file and one normal/ham (negative) file from HDFS.
        JavaRDD<String> spam = sc.textFile("hdfs://localhost:8020/sample/spam/spmsga1.eml");
        JavaRDD<String> mail = sc.textFile("hdfs://localhost:8020/sample/mail/3-1msg1.txt");

        // A HashingTF maps each email's tokens into a 10,000-dimensional
        // term-frequency feature vector.
        final HashingTF tf = new HashingTF(10000);

        // setLabeledPoint uses map, which applies a Function to every element
        // of an RDD and returns a new RDD; Function<A, B> takes an input of
        // type A and produces an output of type B. Spam is labeled 1.0, ham 0.0.
        JavaRDD<LabeledPoint> positiveData = setLabeledPoint(spam, tf, 1.0);
        JavaRDD<LabeledPoint> negativeData = setLabeledPoint(mail, tf, 0.0);

        // Combine the positive and negative examples into one training set.
        JavaRDD<LabeledPoint> trainingData = positiveData.union(negativeData);

        // Cache the training RDD: SGD makes several passes over the data,
        // so caching avoids re-reading it from HDFS on every iteration.
        trainingData.cache();

        // Train a logistic-regression model with stochastic gradient descent.
        LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD();
        LogisticRegressionModel model = lrLearner.run(trainingData.rdd());

        // Classify one held-out spam message and one held-out ham message,
        // applying the same HashingTF transformation used for training.
        Vector positiveSample = tf.transform(filterText(sc, "hdfs://localhost:8020/sample/spam/spmsga2.eml"));
        predictionResult("spmsga2.eml", model.predict(positiveSample));
        Vector negativeSample = tf.transform(filterText(sc, "hdfs://localhost:8020/sample/mail/3-1msg2.txt"));
        predictionResult("3-1msg2.txt", model.predict(negativeSample));

        sc.stop();
    }
	  
    public static void predictionResult(String emailName, double predictValue) {
        if (predictValue == 1.0) {
            Logger.getGlobal().info(emailName + " is predicted to be spam (" + predictValue + ")");
        } else {
            Logger.getGlobal().info(emailName + " is predicted to be normal mail (" + predictValue + ")");
        }
    }
    /**
     * Maps each email in the RDD to a labeled feature vector.
     * @param rdd emails, one per element
     * @param tf the shared HashingTF used for every feature vector
     * @param label 1.0 for spam (positive), 0.0 for ham (negative)
     * @return labeled points for training
     */
    public static JavaRDD<LabeledPoint> setLabeledPoint(JavaRDD<String> rdd, final HashingTF tf, final double label) {
        return rdd.map(new Function<String, LabeledPoint>() {
            private static final long serialVersionUID = 1L;

            // Converts one email (String) into a LabeledPoint.
            @Override
            public LabeledPoint call(String email) {
                return new LabeledPoint(label, tf.transform(tokenize(email)));
            }
        });
    }

    // Extracts word tokens. \w+ requires at least one word character; a
    // zero-width pattern such as [\w]* would also emit empty tokens, which
    // HashingTF would hash into the feature vector as noise.
    public static List<String> tokenize(String text) {
        List<String> tokens = new LinkedList<>();
        Matcher matcher = Pattern.compile("\\w+").matcher(text);
        while (matcher.find()) {
            tokens.add(matcher.group());
        }
        return tokens;
    }
	  
    public static List<String> filterText(JavaSparkContext sc, String path) {
        // Collect the file's lines to the driver, skip blank lines, and
        // tokenize with the same rule used for the training features, so
        // that training and prediction share one feature mapping.
        List<String> list = new LinkedList<>();
        for (String line : sc.textFile(path).collect()) {
            if (!line.trim().isEmpty()) {
                list.addAll(tokenize(line));
            }
        }
        return list;
    }
}
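The listing above checks the model on only two held-out files. A quick way to sanity-check it on more data is to measure accuracy over a labeled RDD; a minimal sketch, assuming the same Spark 1.x MLlib API (the Evaluate class and accuracy method are mine, not part of the original code):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.regression.LabeledPoint;

public final class Evaluate {
    // Fraction of labeled points whose predicted class equals the label.
    // LogisticRegressionModel.predict returns the 0.0/1.0 class under the
    // default 0.5 threshold, so exact comparison is safe here.
    public static double accuracy(final LogisticRegressionModel model,
                                  JavaRDD<LabeledPoint> data) {
        long correct = data.filter(new Function<LabeledPoint, Boolean>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Boolean call(LabeledPoint p) {
                return model.predict(p.features()) == p.label();
            }
        }).count();
        return (double) correct / data.count();
    }
}

Calling Evaluate.accuracy(model, trainingData) right after training gives the training accuracy; for an honest estimate, split the data first (for example with JavaRDD.randomSplit) and evaluate on the held-out portion.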

Dependencies

The full dependency list from the project's pom.xml follows; for this example only spark-core_2.10 and spark-mllib_2.10 are actually required.

 <dependencies>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.3.1</version>
      <scope>provided</scope>
    </dependency>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.3.1</version>
      <scope>provided</scope>
    </dependency>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.3.1</version>
      <scope>provided</scope>
    </dependency>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency> 
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency> 
      <groupId>com.datastax.spark</groupId>
      <artifactId>spark-cassandra-connector_2.10</artifactId>
      <version>1.0.0-rc5</version>
    </dependency>
    <dependency> 
      <groupId>com.datastax.spark</groupId>
      <artifactId>spark-cassandra-connector-java_2.10</artifactId>
      <version>1.0.0-rc5</version>
    </dependency>
    <dependency> 
      <groupId>org.elasticsearch</groupId>
      <artifactId>elasticsearch-hadoop-mr</artifactId>
      <version>2.0.0.RC1</version>
    </dependency>
    <dependency> 
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-client</artifactId>
      <version>8.1.14.v20131031</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.3.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.0</version>
    </dependency>
    <dependency>
      <groupId>net.sf.opencsv</groupId>
      <artifactId>opencsv</artifactId>
      <version>2.0</version>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.binary.version}</artifactId>
      <version>2.2.1</version>
    </dependency>
  </dependencies>
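Note that the scalatest artifact references a scala.binary.version Maven property that is not shown above; presumably the POM's <properties> block defines it to match the _2.10 suffix of the other artifacts, along the lines of:

  <properties>
    <scala.binary.version>2.10</scala.binary.version>
  </properties>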