Straight to the pom.xml:
<!-- Dependencies for reading Excel files -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>com.crealytics</groupId>
    <artifactId>spark-excel_2.11</artifactId>
    <version>0.13.1</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>3.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>4.0.1</version>
</dependency>
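Note that the pom above only covers the Excel side; Spark itself must also be on the classpath. A minimal sketch of the missing piece, assuming a Spark 2.4.x build to match the Scala 2.11 (_2.11) artifact above; adjust the version to whatever your cluster runs:

<!-- Assumed addition: Spark SQL, needed for SparkSession / DataFrame; the version here is a guess -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.5</version>
</dependency>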
The implementation is as follows:
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

object SellOrder {
  def main(args: Array[String]): Unit = {
    import org.apache.spark.sql._
    import com.crealytics.spark.excel._
    // Point Hadoop at a local winutils installation (needed when running on Windows)
    System.setProperty("hadoop.home.dir", "D:\\hadoop\\hadoop-2.7.6")
    // File path; a local file here, but an HDFS path works too
    val path = "C:\\Users\\XYH\\Desktop\\SellOrder\\SellOder.xls"
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().appName("sellOrder").master("local[*]")
      .getOrCreate()
    // Filter out unnecessary log output
    spark.sparkContext.setLogLevel("WARN")
    // Custom header schema, named after the headers in the source file.
    // Later tests threw a NullPointerException with it, so it is NOT recommended here.
    val productSchema = StructType(Array(
      StructField("Order_date", StringType, nullable = false),
      StructField("Order_type", StringType, nullable = false),
      StructField("Id", StringType, nullable = false),
      StructField("Uid", StringType, nullable = false),
      StructField("Date1", StringType, nullable = false),
      StructField("Productname", StringType, nullable = false),
      StructField("Number", StringType, nullable = false),
      StructField("Price", StringType, nullable = false),
      StructField("Totalprice", StringType, nullable = false)
    ))
    // Read the Excel file with the relevant options
    val order: DataFrame = spark.read
      .excel(
        header = true, // required
        dataAddress = "'客户筛选'!A1", // optional, default "A1"; "客户筛选" is the sheet name
        timestampFormat = "MM-dd-yyyy" // optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
      )
      // Apply the custom schema defined above. Every test hit a NullPointerException,
      // so it is disabled for now; if anyone knows why, please help out in the comments!
      //.schema(productSchema)
      // The file to load
      .load(path)
      // Rename each column so the SQL below is easier to write
      .toDF("Order_date", "Order_type", "Id", "Uid", "Date1", "Productname", "Number", "Price", "Totalprice")
    // Register the DataFrame as a view so we can query it with SQL
    order.createTempView("SellOrder")
    import spark.implicits._
    // The rest mostly uses SQL to pull out the information we need
    val sql =
      """
        |select Order_date,Uid,Productname,Totalprice from SellOrder
        |""".stripMargin
    // For printing and checking during testing
    //spark.sql(sql).show(400)
    // Count all distinct customers
    println("------------------")
    val customer =
      """
        |select count(distinct Uid) from SellOrder
        |""".stripMargin
    spark.sql(customer).show()
    // Total yearly sales per customer
    val usersql =
      """
        |with t1 as (select Order_date,Uid,Productname,Totalprice from SellOrder),
        |t2 as (select Uid,sum(Totalprice) total from t1 group by Uid)
        |select Uid,total,row_number() over(order by total desc) rn from t2
        |""".stripMargin
    //spark.sql(usersql).show(400)
    val userproductDF: DataFrame = spark.sql(usersql)
    // Export the result as an .xlsx file
    userproductDF.write
      .format("com.crealytics.spark.excel")
      .option("dataAddress", "'Sheet1'!A1") // target cell, default: A1
      .option("header", "true")
      .option("dateFormat", "yyyy-mm-dd") // optional, default: yy-m-d h:mm
      .option("timestampFormat", "yyyy-mm-dd") // optional, default: yyyy-mm-dd hh:mm:ss.000
      .mode("append") // optional, default: overwrite
      // The destination can be local or HDFS: add the file:// prefix for local, omit it for HDFS
      .save("file:///C:\\Users\\XYH\\Desktop\\SellOrder\\userproduct_1.xlsx")
  }
}
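On the NullPointerException with the custom schema: one possible workaround (my own assumption, not something verified in the original tests) is to skip .schema() entirely and cast the columns after loading, since without schema inference the source reads every column as a string anyway. A minimal sketch reusing the order DataFrame and column names from above:

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DoubleType

// Hypothetical workaround: keep the all-string load, then cast the numeric columns
val typedOrder = order
  .withColumn("Number", col("Number").cast(DoubleType))
  .withColumn("Price", col("Price").cast(DoubleType))
  .withColumn("Totalprice", col("Totalprice").cast(DoubleType))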
Once the code has run, just open the result with Office. If you still need sorting or filtering, Excel can handle those operations directly!
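If you would rather check the result programmatically instead of opening Office, the same data source can read the file back in; a small sketch, assuming the output path used above:

// Read the generated .xlsx back in and print it to verify
val check = spark.read
  .format("com.crealytics.spark.excel")
  .option("header", "true")
  .load("file:///C:\\Users\\XYH\\Desktop\\SellOrder\\userproduct_1.xlsx")
check.show()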
Honestly, it's not hard!!
If your small business needs help with its data, feel free to send me a private message!!!