方法一、Scala (Method 1: plain Scala RDD API):
// Load the salaries CSV from HDFS and drop the header row (the only line containing "Id").
val salaryRDD = sc.textFile("hdfs://192.168.1.171:9999/user/root/input/salaries/Salaries.csv").filter(!_.contains("Id"))

// Max of two doubles. Written as a single expression — Scala returns the last
// expression's value, so `return` is unnecessary (and an anti-idiom).
def max(a: Double, b: Double): Double = if (a > b) a else b

// Per-JobTitle maximum TotalPay, written out as text.
// BUG FIX: the original pipeline referenced `salariesRDD`, which is never
// defined — the RDD created above is `salaryRDD`.
salaryRDD.map(_.split(","))
  .filter(!_(0).contains("Id"))  // defensive re-check; header was already dropped above
  .filter(!_(7).isEmpty)         // skip rows with an empty TotalPay field (column 7)
  .map(line => (line(2).trim.toUpperCase, line(7).toDouble)) // (JobTitle, TotalPay)
  .reduceByKey(max)
  .saveAsTextFile("hdfs://192.168.1.171:9999/user/root/input/salaries/max.txt") // NOTE: saveAsTextFile creates a *directory* named max.txt

// 方法二、DataFrame (Method 2: DataFrame):
// One record per CSV row; field order matches the CSV column order.
case class Salary(Id: Int, EmployeeName: String, JobTitle: String, BasePay: Double, OvertimePay: Double, OtherPay: Double, Benefits: Double, TotalPay: Double, TotalPayBenefits: Double, Year: String, Notes: String, Agency: String, Status: String)
// Parse each CSV row into a Salary and convert to a DataFrame
// (Spark 1.x API: SQLContext implicits provide .toDF()).
val salaryDF = salaryRDD.map(_.split(","))
  .map(p => Salary(p(0).trim.toInt, p(1), p(2),
                   p(3).trim.toDouble, p(4).trim.toDouble, p(5).trim.toDouble,
                   p(6).trim.toDouble, p(7).trim.toDouble, p(8).trim.toDouble,
                   p(9), p(10), p(11), p(12)))
  .toDF()
// Register for SQL queries (Spark 1.x name; createOrReplaceTempView in 2.x+).
salaryDF.registerTempTable("salary")

// Total pay per year.
val salaries = sqlContext.sql("SELECT Year, SUM(TotalPay) FROM salary GROUP BY Year")
// BUG FIX: the original printed only t(0) (the Year) and silently discarded
// t(1), the SUM(TotalPay) the query was written to compute.
salaries.map(t => "Year: " + t(0) + ", TotalPay: " + t(1)).collect().foreach(println)

// 方法三、引用外部包 (Method 3: use the external spark-csv package):
spark-shell --packages com.databricks:spark-csv_2.10:1.3.0
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)

// Read the CSV through the databricks spark-csv data source; the "header"
// option tells it the first line is column names, not data.
val csvOptions = Map(
  "path"   -> "hdfs://192.168.1.171:9999/user/root/input/salaries/Salaries.csv",
  "header" -> "true"
)
val df = sqlContext.load("com.databricks.spark.csv", csvOptions)

// Project just the EmployeeName column and write it back out as CSV.
val names = df.select("EmployeeName")
names.save("hdfs://192.168.1.171:9999/user/root/input/salaries/EmployeeName", "com.databricks.spark.csv")

// 方法四、SPARK-SQL (Method 4: the spark-sql CLI):
spark-sql --packages com.databricks:spark-csv_2.10:1.3.0

-- Register the HDFS CSV file as a table backed by the spark-csv data source.
CREATE TABLE salaries
USING com.databricks.spark.csv
OPTIONS (path "hdfs://192.168.1.171:9999/user/root/input/salaries/Salaries.csv", header "true");

-- Look up one employee by exact name.
SELECT * FROM salaries A WHERE EmployeeName = 'Nelly S Fong';

-- Highest TotalPay for each job title.
SELECT JobTitle, MAX(TotalPay) FROM salaries GROUP BY JobTitle;