Reading a CSV file with the Java Spark API in IDEA, doing simple processing with Spark SQL, and saving the result to a MySQL database

Requirements:
1. Read the CSV and keep only the rows whose Tier column contains "T".
2. Create a flag column: when _201903 > _201902 the value is "up", otherwise "down".
3. Insert the result into a MySQL database.

Without further ado, here is the code:
--------------------------------------------------------------------------------
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.HashMap;
import java.util.Properties;

public class SparkCsvDemo {
    public static void main(String[] args) {
        String hdfsInAddress = "D:\\DevTemp\\AWS\\"; // "hdfs://192.168.209.129:9000/" when reading from the HDFS server
        String inputAddress = "";                    // "in/"
        String csvFileName = "emr-demo-data.csv";
        SparkConf conf = new SparkConf().setMaster("local").setAppName("TestSpark");
        System.out.println("==================");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        HashMap<String, String> options = new HashMap<>();
        options.put("header", "true");      // treat the first row as the header
        options.put("inferSchema", "true"); // infer column types automatically
        // main is static, so there is no "this"; reference the local variables directly
        options.put("path", hdfsInAddress + inputAddress + csvFileName);
        options.put("dateFormat", "yyyy-MM-dd"); // Java date patterns are case-sensitive: yyyy-MM-dd, not YYYY-MM-DD
        System.out.println("Path of the input file on HDFS: " + hdfsInAddress + inputAddress + csvFileName);

        /**** declare the column types ****/
        StructField[] structFields = new StructField[9];
        structFields[0] = DataTypes.createStructField("Tier", DataTypes.StringType, true);
        structFields[1] = DataTypes.createStructField("SellerCode", DataTypes.StringType, true);
        structFields[2] = DataTypes.createStructField("SellerName", DataTypes.StringType, true);
        structFields[3] = DataTypes.createStructField("DataSource", DataTypes.StringType, true);
        structFields[4] = DataTypes.createStructField("SellerProvince", DataTypes.StringType, true);
        structFields[5] = DataTypes.createStructField("_201901", DataTypes.DoubleType, true);
        structFields[6] = DataTypes.createStructField("_201902", DataTypes.DoubleType, true);
        structFields[7] = DataTypes.createStructField("_201903", DataTypes.DoubleType, true);
        structFields[8] = DataTypes.createStructField("flag", DataTypes.StringType, true); // null at load time, filled in by the SQL below
        StructType structType = new StructType(structFields);

        // sqlContext.load and registerTempTable are deprecated; see the SparkSession sketch at the end of the post
        Dataset<Row> dataFrame = sqlContext.load("com.databricks.spark.csv", structType, options);
        // Dataset<Row> cars = (new CsvParser()).withUseHeader(true).csvFile(sqlContext, "cars.csv"); // alternative: read the CSV through spark-csv's CsvParser
        dataFrame.registerTempTable("result");

        StringBuffer sparkSql = new StringBuffer("select ");
        sparkSql.append("Tier");
        sparkSql.append(", SellerCode");
        sparkSql.append(", SellerName");
        sparkSql.append(", DataSource");
        sparkSql.append(", SellerProvince");
        sparkSql.append(", _201901");
        sparkSql.append(", _201902");
        sparkSql.append(", _201903");
        sparkSql.append(", if(_201903 > _201902, 'up', 'down') as flag");
        sparkSql.append(" from result");
        Dataset<Row> resultFrame = sqlContext.sql(sparkSql.toString());
        // resultFrame.createOrReplaceTempView("resultView"); // create a temporary view
        // System.out.println(resultFrame.limit(10).showString(20, 0, false)); // print the first rows as a string

        System.out.println("****** print schema *******");
        resultFrame.printSchema();
        // resultFrame.select("SellerName").show();

        // Columns: Tier SellerCode SellerName DataSource SellerProvince _201901 _201902 _201903
        Dataset<Row> df = resultFrame.select(
                resultFrame.col("Tier"),
                resultFrame.col("SellerCode"),
                resultFrame.col("SellerName"),
                resultFrame.col("DataSource"),
                resultFrame.col("SellerProvince"),
                resultFrame.col("_201901"),
                resultFrame.col("_201902"),
                resultFrame.col("_201903"),
                resultFrame.col("flag"));
        df = df.filter(df.col("Tier").contains("T")); // where condition; equalTo etc. also work
        // df = df.filter(df.col("_201902").cast(DataTypes.FloatType).gt(df.col("_201901").cast(DataTypes.FloatType))); // gt = greater than
        // df = df.orderBy(df.col("_201902").cast(DataTypes.FloatType).asc_nulls_first()); // cast the type and sort ascending, nulls first
        // df.groupBy("age").count(); // grouping example
        df.show();

        /************* write the result to the MySQL database ******************/
        // database connection
        String url = "jdbc:mysql://127.0.0.1:3306/hive?useUnicode=true&characterEncoding=utf-8";
        Properties connectionProperties = new Properties();
        connectionProperties.put("user", "root");
connectionProperties.put("password","123456"); connectionProperties.put("driver","com.mysql.jdbc.Driver"); /**插入数据库表中**/ df.write().mode(SaveMode.Overwrite).jdbc(url,"t_result",connectionProperties);//Overwrite会覆盖数据和表结构 sc.stop(); } } ------------------------------------------------------------- 测试用csv文件 插入MySql结果: 以上测试结果好用。由于摸索的实现,没有将方法提取,需要的小朋友自己修改吧。
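One caveat worth repeating from the comment in the write step: SaveMode.Overwrite drops and recreates t_result on every run, replacing the table definition along with the data. If the table should keep its schema and accumulate rows across runs, SaveMode.Append is the usual alternative:
-------------------------------------------------------------
// append rows instead of dropping and recreating the table
df.write().mode(SaveMode.Append).jdbc(url, "t_result", connectionProperties);
-------------------------------------------------------------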