Using Spark
- Installing the Maven plugin for Eclipse: http://blog.csdn.net/qjyong/article/details/9098213
- Creating a Maven project in Eclipse
File -> New -> Project -> Maven -> Maven Project -> quickstart archetype
- Adding dependencies
Search https://mvnrepository.com/ for the required artifacts
<!-- all Spark artifacts should share the same version -->
<dependency> <!-- Spark dependency -->
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.5.2</version>
</dependency>
<dependency> <!-- Hadoop dependency -->
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.10</artifactId>
    <version>1.5.2</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.5.2</version>
</dependency>
- Writing the program
3.1 Importing data into Hive/HDFS with Sqoop
sqoop import \
  --connect jdbc:mysql://49.123.21.100:3306/bbk \
  --username root \
  --password root \
  --table C \
  --fields-terminated-by '\t' -m 1
3.2 Writing the Spark program
SparkConf sparkConf = new SparkConf().setAppName("TestKMeans");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
String filePath = "hdfs:///user/kpnm/C";
JavaRDD<String> lines = sc.textFile(filePath);
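Since the Sqoop import above terminates fields with '\t', each line read here can be split into its columns before further processing. A minimal sketch, assuming we simply split on the tab character (the variable name fields is illustrative and not part of the original notes):
JavaRDD<String[]> fields = lines.map(new Function<String, String[]>() {
    public String[] call(String line) {
        // one array element per column exported by Sqoop
        return line.split("\t");
    }
});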
3.3 Functional programming interfaces
Class               Method to implement     Used by
Function<T,R>       R call(T)               map(), filter()
Example 1: filter
JavaRDD<String> dongtang = lines.filter(new Containsdt());

class Containsdt implements Function<String, Boolean> {
    public Boolean call(String x) {
        return x.contains("东塘店");
    }
}
Example 2: map
SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
JavaRDD<Integer> data = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
// input RDD: 1 2 3 4 5 6  ->  after map(x -> x + 1): 2 3 4 5 6 7
Function<Integer, Integer> fun = new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x + 1;
    }
};
JavaRDD<Integer> mapped = data.map(fun);
Class                  Method to implement     Used by
Function2<T1,T2,R>     R call(T1,T2)           aggregate(), fold()
Example 3: aggregate
List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
JavaRDD<Integer> javaRDD = ctx.parallelize(data,3);
Integer aggregateRDD = javaRDD.aggregate(2, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
}, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + aggregateRDD);
// output: 27 (the elements sum to 19; the zero value 2 is applied once per partition (3 partitions) and once more when the partition results are combined: 19 + 2 * 4 = 27)
Example 4: fold
List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
JavaRDD<Integer> javaRDD = ctx.parallelize(data,3);
System.out.println(javaRDD.fold(1, new Function2<Integer,Integer,Integer>(){
public Integer call(Integer x,Integer y){
return x+y;
}
}));
// output: 23 (the elements sum to 19; the zero value 1 is applied once per partition (3) and once when combining: 19 + 1 * 4 = 23)
Class                   Method to implement     Used by
FlatMapFunction<T,R>    Iterable<R> call(T)     flatMap()
Example 5: flatMap
JavaRDD<String> sjdd=ctx.parallelize(Arrays.asList("hello world how are you","i am fine","thanks for you"));
sjdd.foreach(new VoidFunction<String>(){
public void call(String x){
System.out.println(x);
}
});
sjdd=sjdd.flatMap(new FlatMapFunction<String,String>(){
public Iterable<String> call(String x){
return Arrays.asList(x.split(" "));
}
});
System.out.println("Data after flatMap:");
sjdd.foreach(new VoidFunction<String>(){
public void call(String x){
System.out.println(x);
}
});
Before flatMap:
"hello world how are you"
"i am fine"
"thanks for you"
After flatMap:
hello
world
how
are
you
i
am
fine
thanks
for
you
Functions on JavaPairRDD or JavaDoubleRDD likewise require their own function interfaces (for example PairFunction for mapToPair()); a sketch follows.
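A minimal sketch of a pair-RDD word count, assuming it reuses the word RDD sjdd from Example 5 (the variable names ones and counts are illustrative): mapToPair() takes a PairFunction and reduceByKey() takes a Function2.
JavaPairRDD<String, Integer> ones = sjdd.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String word) {
        // emit (word, 1) for every word
        return new Tuple2<String, Integer>(word, 1);
    }
});
JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) {
        return a + b;   // add up the 1s emitted for each word
    }
});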
5 Submitting the job
Packaging with Maven
Running mvn install builds the jar; bundling all dependencies into a single jar normally requires the maven-shade (or maven-assembly) plugin, as sketched below.
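A minimal maven-shade-plugin configuration for the pom.xml (an assumption; the original notes do not show the plugin section); with it, mvn package / mvn install produces a jar that contains the dependencies:
<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-shade-plugin</artifactId>
      <version>2.4.3</version>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>shade</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>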
sudo spark-submit \
  --class com.mycompany.app.yang.App \
  --executor-memory 5G \
  --total-executor-cores 5 \
  --driver-class-path /home/kpnm/mysql-connector-java-5.1.41-bin.jar \
  /home/kpnm/yxs/yang-0.0.1-SNAPSHOT.jar
At minimum, --class and the path to the application jar are required.
Trend analysis:
package com.mycompany.app.yang;
import scala.Tuple2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class App
{
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) throws Exception {
SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local").set("spark.kryoserializer.buffer.max","128"); // note: setMaster("local") overrides the master chosen by spark-submit; remove it when running on a cluster
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(ctx);
HiveContext hiveCtx=new HiveContext(ctx);
DataFrame rdd;
rdd=hiveCtx.sql("select SHOP,CATE,ACCTURE,PERIOD,ACCOUNT,collect_set(VALUE) from C group by SHOP,CATE,ACCTURE,PERIOD,ACCOUNT");
JavaRDD<Row> jdd=rdd.toJavaRDD();
// apply the trend computation to every grouped row and write the result out
JavaRDD<List<Double>> trends = jdd.map(new linearegression());
trends.saveAsTextFile("/Home/kpnm/yxs/result.txt");
ctx.stop();
}
}
class linearegression implements Function<Row, List<Double>> {
    // Computes a simple trend for each group: central differences over the collected VALUE list.
    public List<Double> call(Row s) {
        // column 5 holds collect_set(VALUE); its toString() typically looks like "WrappedArray(v0, v1, ...)"
        String str = s.get(5).toString();
        // strip the "WrappedArray(" prefix (13 characters) and the trailing ")"
        String datas = str.substring(13, str.length() - 1);
        String[] data = datas.split(",");
        List<Double> list = new ArrayList<Double>();
        double k;
        for (int i = 0; i < data.length - 2; i++) {
            // central difference: slope between the i-th and (i+2)-th values
            k = (Double.valueOf(data[i + 2]) - Double.valueOf(data[i])) / 2;
            list.add(Double.valueOf(k));
        }
        return list;
    }
}
Check the job's execution status (e.g. through the Spark web UI).