一、首先写一段 wordcount 的简单 demo
package org.jsw.spark;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.function.Consumer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Spark word-count demo.
 *
 * <p>Reads a pre-tokenized (whitespace-separated) text file, counts the
 * occurrences of each token, prints {@code word:count} per line, and finally
 * prints the total token count.
 */
public class spark_demo {

    private static JavaSparkContext sc;

    public static void main(String[] args) {
        // Running total of all token occurrences across the file.
        long all = 0;
        // "local" master: runs the whole job inside this JVM (demo mode).
        SparkConf conf = new SparkConf().setAppName("firestsparkapp").setMaster("local");
        sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc
                .textFile("/home/develop/workspace/sparkdemo/fasttest_train.txt");
        // Split each line on single spaces; was a raw JavaRDD, now parameterized
        // so the mapToPair below type-checks without unchecked warnings.
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });
        // Classic word-count: (word, 1) pairs ...
        JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        // ... summed per key.
        JavaPairRDD<String, Integer> wordCount = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // Collect to the driver (fine for a demo-sized file) and report.
        List<Tuple2<String, Integer>> list = wordCount.collect();
        for (Tuple2<String, Integer> pair : list) {
            System.out.println(pair._1() + ":" + pair._2());
            all += pair._2();
        }
        System.out.println(all);
        sc.close();
    }
}
该程序用于统计预分词文本中每个词的出现次数
// Gradle build for the Spark word-count demo (legacy `compile` configuration
// style, so this targets an older Gradle; Java 8 source/target).
apply plugin: 'java'
apply plugin: 'eclipse'
sourceCompatibility = 1.8
targetCompatibility = 1.8
version = '1.0'
repositories {
// NOTE(review): plain-http repositories are insecure and rejected by modern
// Gradle; the internal 10.207.2.29 mirror is only reachable on that network.
maven { url "http://maven.nlpcn.org"}
maven { url "http://10.207.2.29:8081/repository/maven-public" }
maven { url "http://maven.oschina.net/content/groups/public/"}
mavenCentral()
}
dependencies {
compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.22'
compile group: 'org.slf4j', name: 'slf4j-nop', version: '1.7.22'
compile 'org.apache.commons:commons-lang3:3.5'
// spark-core_2.10: Spark 2.1.1 built against Scala 2.10.
compile group: 'org.apache.spark', name: 'spark-core_2.10', version: '2.1.1'
compile group: 'commons-collections', name: 'commons-collections', version: '3.2'
testCompile group: 'junit', name: 'junit', version: '4.+'
}
jar {
baseName = 'sparkdemo'
// Main-Class must be set explicitly so `java -jar` / spark-submit can find
// the entry point; it must match the fully-qualified class in src.
manifest {
attributes 'Main-Class': 'org.jsw.spark.spark_demo'
}
version = '0.3'
}
test { systemProperties 'property': 'value' }
uploadArchives {
repositories {
flatDir { dirs 'repos' }
}
}
二、编写 build.gradle 文件
注意 jar 域必须显式提供 main class，即 attributes 'Main-Class': 'org.jsw.spark.spark_demo'
三、提交 jar 文件到 Spark 集群
# Submit the fat jar to the cluster. --class must be the fully-qualified main
# class (org.jsw.spark.spark_demo, per the source package and jar manifest);
# the original com.leju.spark.spark_demo would fail with ClassNotFoundException.
./bin/spark-submit \
--class org.jsw.spark.spark_demo \
--master spark://10.204.12.31:7077 \
--executor-memory 1G \
--total-executor-cores 100 \
/home/develop/workspace/sparkdemo/build/libs/sparkdemo-0.3.jar
demo全部工程代码见:https://github.com/jiashiwen/sparkdemo