1. Programmatic approach
(1) Java version
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class WordPro {
private static SparkSession globalSpark;
private static Logger logger = LoggerFactory.getLogger(WordPro.class);
public static void main(String[] args) {
//obtain the SparkSession
SparkSession spark = SparkSession.builder().appName("wordPro").master("local[4]").getOrCreate();
globalSpark = spark;
JavaRDD<Row> rowRDD = spark.read().text("D:\\data\\testdatas\\sparksql\\pro\\keyword.txt").toJavaRDD();
logger.info("Full data:");
rowRDD.collect().forEach(line -> System.out.println(line));
//first filter out rows that do not have the expected number of fields
JavaRDD<Row> programRdd = rowRDD.filter(new Function<Row, Boolean>() {
private static final long serialVersionUID = 975739302260154160L;
@Override
public Boolean call(Row v1) throws Exception {
boolean result = v1.get(0).toString().split("\t").length == 6;
return result;
}
}).map(new Function<Row, Row>() {
//split each line on tab and reassemble it as a Row matching the StructType
@Override
public Row call(Row v1) throws Exception {
String[] datas = v1.get(0).toString().split("\t");
Row row = RowFactory.create(
datas[0],
datas[1],
datas[2],
datas[3],
datas[4],
datas[5]
);
return row;
}
});
//create the StructType from a list of StructFields
List<StructField> structFieldList = new ArrayList<>();
structFieldList.add(DataTypes.createStructField("date", DataTypes.StringType, true));
structFieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
structFieldList.add(DataTypes.createStructField("condition", DataTypes.StringType, true));
structFieldList.add(DataTypes.createStructField("city", DataTypes.StringType, true));
structFieldList.add(DataTypes.createStructField("platform", DataTypes.StringType, true));
structFieldList.add(DataTypes.createStructField("version", DataTypes.StringType, true));
StructType structType = DataTypes.createStructType(structFieldList);
//convert the RDD into a Dataset using the structType
Dataset<Row> infoDataset = spark.createDataFrame(programRdd, structType);
//register a temporary view
infoDataset.createOrReplaceTempView("info");
//build the query conditions
Map<String, String> params = new HashMap<>(2);
params.put("city", "nanjing");
selectByCondition(params);
}
private static void selectByCondition(Map<String, String> params) {
String city = params.get("city");
String platform = params.get("platform");
//assemble the query SQL
StringBuffer sqlBuffer = new StringBuffer("select * from info where 1=1 ");
sqlBuffer = city == null ? sqlBuffer : sqlBuffer.append(" and city = '" + city + "'");
sqlBuffer = platform == null ? sqlBuffer : sqlBuffer.append(" and platform = '" + platform + "'");
String sql = sqlBuffer.toString();
logger.info("Query SQL: " + sql);
globalSpark.sql(sql).show();
}
}
The comments should already make the code clear. The corresponding output is shown below (all of the approaches produce the same result, so it is only listed once):
Full data:
[2015-10-01 jack water beijing android 2.0]
[2015-10-01 white barbecue nanjing iphone 2.0]
[2015-10-02 white seafood beijing android 1.0]
[2015-10-02 leo seafood beijing android 1.0]
[2015-10-02 marry seafood beijing android 1.0]
[2015-10-02 tom seafood beijing android 1.0]
[2015-10-02 jack seafood beijing android 1.0]
[2015-10-02 jack seafood beijing android 1.0]
[2015-10-02 tom water beijing android 1.2]
[2015-10-02 leo water beijing android 1.2]
[2015-10-02 jack water beijing android 1.2]
[2015-10-02 jack water beijing android 1.2]
[2015-10-02 leo barbecue beijing android 1.5]
[2015-10-02 marry barbecue beijing android 1.5]
[2015-10-02 marry barbecue beijing android 1.5]
[2015-10-02 jack toy beijing android 2.0]
[2015-10-02 white tour nanjing iphone 2.0]
Query result:
+----------+-----+---------+-------+--------+-------+
| date| name|condition| city|platform|version|
+----------+-----+---------+-------+--------+-------+
|2015-10-01|white| barbecue|nanjing| iphone| 2.0|
|2015-10-02|white| tour|nanjing| iphone| 2.0|
+----------+-----+---------+-------+--------+-------+
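As an aside, the same optional conditions can also be applied through the DataFrame API instead of concatenating a SQL string, which avoids quoting problems in the hand-built SQL. Below is a minimal sketch, assuming the infoDataset and params built in main above; the helper name selectByConditionDF is hypothetical and not part of the original code, and the static import of org.apache.spark.sql.functions.col goes with the other imports at the top of the file:
import static org.apache.spark.sql.functions.col;
//minimal sketch: apply each optional condition as a Dataset filter instead of
//appending it to a SQL string; assumes every key in params is a column of the info table
private static void selectByConditionDF(Dataset<Row> infoDataset, Map<String, String> params) {
    Dataset<Row> result = infoDataset;
    for (Map.Entry<String, String> entry : params.entrySet()) {
        result = result.filter(col(entry.getKey()).equalTo(entry.getValue()));
    }
    result.show();
}
Called as selectByConditionDF(infoDataset, params), this should print the same two nanjing rows as the SQL version.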
(2) Scala version
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
object WordPro extends App {
case class Info(date: String, name: String, condition: String, city: String, platform: String, version: String)
val spark = SparkSession.builder()
.appName("wordProject")
.master("local[4]")
.getOrCreate()
val originalData = spark.read.text("D:\\data\\testdatas\\sparksql\\pro\\keyword.txt").rdd
//keep only lines with exactly 6 tab-separated fields (same check as the Java version)
val infoRdd = originalData.filter(row => row.getString(0).split("\t").length == 6)
.map(line => {
val arr = line.getString(0).split("\t")
val date = arr(0)
val name = arr(1)
val condition = arr(2)
val city = arr(3)
val platform = arr(4)
val version = arr(5)
Row(date,name,condition,city,platform,version)
})
val structType = StructType(
StructField("date",StringType,false)::
StructField("name",StringType,false)::
StructField("condition",StringType,false)::
StructField("city",StringType,false)::
StructField("platform",StringType,false)::
StructField("version",StringType,false)::
Nil
)
val infoDF = spark.createDataFrame(infoRdd,structType)
infoDF.createOrReplaceTempView("info")
val params = Map(
"city" -> "nanjing"
)
println("所有数据:")
spark.sql("select * from info").show()
selectInfoByCondition(params)
def selectInfoByCondition(param: Map[String, String]): Unit = {
val date = param.getOrElse("date", null)
val city = param.getOrElse("city", null)
val platform = param.getOrElse("platform", null)
var sql = "select * from info where 1=1 "
if (date != null) {
sql += " and date = '"+date+"'"
}
if (city != null) {
sql += " and city = '"+city+"'"
}
if (platform != null) {
sql += " and platform = '"+platform+"'"
}
spark.sql(sql).show()
}
}
2. Reflection-based approach
(1) Java version
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
public class WordPro {
private static SparkSession globalSpark;
private static Logger logger = LoggerFactory.getLogger(WordPro.class);
public static void main(String[] args) {
//obtain the SparkSession
SparkSession spark = SparkSession.builder().appName("wordPro").master("local[4]").getOrCreate();
globalSpark = spark;
JavaRDD<Row> rowRDD = spark.read().text("D:\\data\\testdatas\\sparksql\\pro\\keyword.txt").toJavaRDD();
logger.info("Full data:");
rowRDD.collect().forEach(line -> System.out.println(line));
/**
* reflection-based conversion
*/
JavaRDD<Info> infoJavaRDD = rowRDD.filter(new Function<Row, Boolean>() {
private static final long serialVersionUID = 975739302260154160L;
@Override
public Boolean call(Row v1) throws Exception {
boolean result = v1.get(0).toString().split("\t").length == 6;
return result;
}
}).map(new Function<Row, Info>() {
private static final long serialVersionUID = -6416816230919347971L;
@Override
public Info call(Row v1) throws Exception {
String[] datas = v1.get(0).toString().split("\t");
return new Info(datas[0], datas[1], datas[2], datas[3], datas[4], datas[5]);
}
});
Dataset<Row> infoDataset = spark.createDataFrame(infoJavaRDD, Info.class);
//register a temporary view
infoDataset.createOrReplaceTempView("info");
//build the query conditions
Map<String, String> params = new HashMap<>(2);
params.put("city", "nanjing");
selectByCondition(params);
}
private static void selectByCondition(Map<String, String> params) {
String city = params.get("city");
String platform = params.get("platform");
//assemble the query SQL
StringBuffer sqlBuffer = new StringBuffer("select * from info where 1=1 ");
sqlBuffer = city == null ? sqlBuffer : sqlBuffer.append(" and city = '" + city + "'");
sqlBuffer = platform == null ? sqlBuffer : sqlBuffer.append(" and platform = '" + platform + "'");
String sql = sqlBuffer.toString();
logger.info("Query SQL: " + sql);
globalSpark.sql(sql).show();
}
}
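The Java reflection version above references an Info class whose definition is not shown. spark.createDataFrame(infoJavaRDD, Info.class) infers the schema from the bean's getters, so Info has to be a plain, serializable JavaBean. A minimal sketch of what it might look like (the original class may differ; note that Spark derives the column order from the bean properties rather than from the constructor order):
import java.io.Serializable;
//minimal sketch of the Info bean assumed by the reflection version above;
//every column needs a corresponding getter for the schema to be inferred
public class Info implements Serializable {
    private String date;
    private String name;
    private String condition;
    private String city;
    private String platform;
    private String version;
    public Info(String date, String name, String condition, String city, String platform, String version) {
        this.date = date;
        this.name = name;
        this.condition = condition;
        this.city = city;
        this.platform = platform;
        this.version = version;
    }
    public String getDate() { return date; }
    public String getName() { return name; }
    public String getCondition() { return condition; }
    public String getCity() { return city; }
    public String getPlatform() { return platform; }
    public String getVersion() { return version; }
}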
(2) Scala version
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
object WordPro extends App {
case class Info(date: String, name: String, condition: String, city: String, platform: String, version: String)
val spark = SparkSession.builder()
.appName("wordProject")
.master("local[4]")
.getOrCreate()
val originalData = spark.read.text("D:\\data\\testdatas\\sparksql\\pro\\keyword.txt").rdd
import spark.implicits._
//keep only lines with exactly 6 tab-separated fields (same check as the Java version)
val infoDF = originalData.filter(row => row.getString(0).split("\t").length == 6)
.map(line => {
val arr = line.getString(0).split("\t")
val date = arr(0)
val name = arr(1)
val condition = arr(2)
val city = arr(3)
val platform = arr(4)
val version = arr(5)
new Info(date, name, condition, city, platform, version)
}).toDF()
infoDF.createOrReplaceTempView("info")
val params = Map(
"city" -> "nanjing"
)
println("所有数据:")
spark.sql("select * from info").show()
selectInfoByCondition(params)
def selectInfoByCondition(param: Map[String, String]): Unit = {
val date = param.getOrElse("date", null)
val city = param.getOrElse("city", null)
val platform = param.getOrElse("platform", null)
var sql = "select * from info where 1=1 "
if (date != null) {
sql += " and date = '"+date+"'"
}
if (city != null) {
sql += " and city = '"+city+"'"
}
if (platform != null) {
sql += " and platform = '"+platform+"'"
}
spark.sql(sql).show()
}
}
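For reference, every version above reads the same keyword.txt. Judging from the output listed earlier, each line carries six tab-separated fields in the order date, name, condition, city, platform, version, for example (fields separated by real tab characters, shown here with spaces):
2015-10-01 jack water beijing android 2.0
2015-10-01 white barbecue nanjing iphone 2.0
2015-10-02 white seafood beijing android 1.0
Lines that do not split into exactly six fields are dropped by the filter step.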