准备测试数据
数据来源:搜狗实验室(Sogou Labs)
将数据上传到HDFS的根目录“/”:
hadoop fs -put /opt/SogouQ.sample.txt /
准备Scala环境:
在IDEA环境中添加Scala插件
在项目中增加scala全局库
示例
示例:统计日志第4列中两个数值(排名与点击顺序号)相同的记录所占的比例:
scala代码:
package edu.spark.etl
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Computes, for a tab-separated query log, the fraction of records whose
 * fourth column holds two identical space-separated values.
 *
 * NOTE(review): the two values are presumably the result rank and the click
 * order of the Sogou query log — confirm against the data set's format
 * description.
 *
 * A Scala `main` method must be defined inside an `object` to be runnable.
 */
object HotWordEtlScala {

  /**
   * Runs the job on a single local thread and prints the ratio to stdout.
   *
   * A record whose fourth column is missing, or does not contain two
   * space-separated values, makes the job fail (same as before).
   *
   * @param args optional; `args(0)` overrides the input path. Without it the
   *             local file `data/SogouQ.sample.txt` is read.
   */
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setMaster("local[1]").setAppName("Hot Word")
    val sc = new SparkContext(config)
    try {
      // The host machine cannot reach HDFS files that live in a container
      // inside the VM, so a local copy is the default; pass another path as
      // the first argument to override it.
      val inputPath = args.headOption.getOrElse("data/SogouQ.sample.txt")
      val lines = sc.textFile(inputPath)
      val total = lines.count()

      // Array elements are accessed with (), not [].
      // Split the column once and keep only the records whose two values match.
      val hotcount = lines.filter { line =>
        val pair = line.split("\t")(3).split(" ")
        pair(0) == pair(1)
      }.count()

      // For an empty input this is 0f / 0f and prints NaN.
      println("hot ratio is :" + (hotcount.toFloat / total.toFloat))
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
使用 Spark SQL 处理与上述类似的 ETL 任务:
package edu.spark.etl
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark SQL version of the hot-word ratio job: loads the fourth tab-separated
 * column of a query log as two integer columns (`rank`, `click`), then uses a
 * SQL aggregate to count the rows where both are equal and the total rows.
 */
object HotWordSQL {

  /**
   * Runs the job in local mode, shows the aggregate result and prints the
   * hit/total ratio to stdout.
   *
   * A record whose fourth column is missing, or does not hold two
   * space-separated integers, makes the job fail (same as before).
   *
   * @param args optional; `args(0)` overrides the input path. Without it
   *             `hdfs://10.10.10.153:8020/SogouQ.sample.txt` is read.
   */
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setMaster("local").setAppName("Hot Word SQL")
    val sc = new SparkContext(config)
    try {
      val inputPath =
        args.headOption.getOrElse("hdfs://10.10.10.153:8020/SogouQ.sample.txt")
      val lines = sc.textFile(inputPath)

      // Split the fourth column once and turn its two values into an integer Row.
      val rows = lines.map { line =>
        val pair = line.split("\t")(3).split(" ")
        Row(pair(0).toInt, pair(1).toInt)
      }

      // StructType takes a sequence of fields; :: is the list-cons operator.
      val schema = StructType(
        StructField("rank", IntegerType, nullable = false) ::
          StructField("click", IntegerType, nullable = false) :: Nil
      )

      // Obtain a SparkSession; it is used to build the DataFrame and run SQL.
      val ss = SparkSession.builder().getOrCreate()
      val df = ss.createDataFrame(rows, schema)
      df.createOrReplaceTempView("tb")

      // Run the aggregate in SQL: `hit` counts rows with rank == click,
      // `total` counts every row.
      val re = ss.sql("select count(if(t.rank==t.click,1,null)) as hit," +
        "count(1) as total from tb as t")

      // Display the result table.
      re.show()

      // Fetch the single result row and read its fields by name.
      val summary = re.head()
      val hit = summary.getAs[Long]("hit")
      val total = summary.getAs[Long]("total")
      println("========="+hit+"========"+total)
      // For an empty input this is 0f / 0f and prints NaN.
      println(hit.toFloat / total.toFloat)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}