shell操作
查询数据文件某列有几种 和数目
awk -F',' '{gender[$4]+=1} END {for(i in gender) print i "\t" gender[i]}' users.csv
109
female 14858
male 23242
gender 1
最长的行
# Print the longest line in the file. Fix: length( ) with an empty argument
# relies on awk defaulting to $0 — spell out $0 explicitly for clarity and
# portability across awk implementations.
awk '{ if (length(max) < length($0)) max = $0 } END { print max }' users.csv
-n 按数值排序 -r 反向 -t 指定分隔符 -k 指定排序列
sort -nr -t'|' -k 2 lactions.dat
string的长度
expr length "string"
zeppelin操作
启动hadoop集群
start-all.sh
启动zeppelin服务
./bin/zeppelin-daemon.sh start
把多个csv文件放到hdfs下
hdfs dfs -mkdir -p events/data
hdfs dfs -put -f *.csv /events/data
hdfs dfs -ls /events/data
进入zeppelin
192.168.61.107:8000
点击Notebook
Create new note
解释器选择spark
create Note
数据操作
看文件的头
hdfs dfs -cat /events/data/users.csv | head -n 5
spark 加载数据
// Load users.csv from HDFS as a DataFrame.
// Bug fix: the option key was "header " (trailing space), so Spark silently
// ignored it and fell back to default column names — which is exactly why the
// printSchema output pasted below shows _c0.._c6. With "header" the first CSV
// row supplies the real column names (user_id, locale, birthyear, ...).
val dfUsers = spark.read.format("csv").option("header", "true").load("hdfs:///events/data/users.csv")
打印数据格式
dfUsers.printSchema
root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
|-- _c3: string (nullable = true)
|-- _c4: string (nullable = true)
|-- _c5: string (nullable = true)
|-- _c6: string (nullable = true)
查看是否有重复的数据
dfUsers.count
res12: Long = 38209
dfUsers.select( "user_id" ).distinct.count
res13: Long = 38209
对出生年进行类型转换为int 并过滤出不为null且大于1900的数据,统计条数
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import spark.implicits._
// Cast birthyear to Int (non-numeric strings become null), then count rows
// where the year is non-null and greater than 1900.
dfUsers.withColumn("birthyear",$"birthyear".cast(IntegerType)).filter($"birthyear".isNotNull && $"birthyear">lit(1900)).count
对birthyear列类型转换int 并求平均值
// Cast birthyear to Int once, reused below (invalid values become null).
val dfUsers1=dfUsers.withColumn("birthyear",$"birthyear".cast(IntegerType))
// Single-row DataFrame holding the average birth year over non-null rows,
// truncated back to Int (avg ignores nulls).
val dfAvgAge=dfUsers1.select(avg($"birthyear").cast(IntegerType).as("avg_year"))
dfAvgAge.show
+--------+
|avg_year|
+--------+
| 1988|
+--------+
把原来的数据中birthyear为null的用avg_year替换,否则保留原值,并删除avg_year列
crossJoin() 笛卡尔积
// Attach the single-row average to every user via a cross join (1 x N rows),
// replace null birthyears with avg_year, keep original values otherwise,
// then drop the helper column.
val dfFinalUser=dfUsers1.crossJoin(dfAvgAge).withColumn("birthyear",when($"birthyear".isNull,$"avg_year")
.otherwise($"birthyear")).drop("avg_year")
dfFinalUser.show(5)
+----------+------+---------+------+--------------------+------------------+--------+
| user_id|locale|birthyear|gender| joinedAt| location|timezone|
+----------+------+---------+------+--------------------+------------------+--------+
|3197468391| id_ID| 1993| male|2012-10-02T06:40:...| Medan Indonesia| 480|
|3537982273| id_ID| 1992| male|2012-09-29T18:03:...| Medan Indonesia| 420|
| 823183725| en_US| 1975| male|2012-10-06T03:14:...|Stratford Ontario| -240|
|1872223848| en_US| 1991|female|2012-11-04T08:59:...| Tehran Iran| 210|
|3429017717| id_ID| 1995|female|2012-09-10T16:06:...| null| 420|
+----------+------+---------+------+--------------------+------------------+--------+
对性别进行清洗 不是male或female的替换为unknown
// Clean the gender column: keep "male"/"female" as-is, replace anything else
// (null, garbage values) with "unknown".
// Fixes: "1it" (digit one) -> lit in three places, and dfFinaluser ->
// dfFinalUser to match the value actually defined above.
val df = dfFinalUser.withColumn("gender",
  when($"gender" === lit("male") || $"gender" === lit("female"), $"gender")
    .otherwise(lit("unknown")))
spark-shell --conf spark.executor.memory=24g --conf spark.sql.crossJoin.enabled=true
// Count distinct event user_ids that have no matching row in dfUsers
// (left outer join, then keep rows where the right side stayed null).
// Fixes: $e.user_id" had a misplaced quote (must be $"e.user_id") and
// column equality in Spark SQL uses === (== compiles to Any equality and
// would not build a join Column).
dfEvents.alias("e").join(dfUsers.alias("u"), $"e.user_id" === $"u.user_id", "left_outer").filter($"u.user_id".isNull).select("e.user_id").distinct.count
res6:Long=1918266
user_friends.csv 内容为两列 第一列 为user_id,第二列 为多个friend_id 以空格分割的字符串
// Load user_friends.csv: column "user" plus "friends" (one string of
// space-separated friend ids per user).
val dfUserFriends=spark.read.format("csv").option("header",true)
.load("hdfs:///events/data/user_friends.csv")
// Explode the friends string into one (user_id, friend_id) row per friend,
// then drop the original packed column.
val df=dfUserFriends.withColumnRenamed("user","user_id")
.withColumn("friend_id",explode(split($"friends"," ")))
.drop("friends")
df.show(2)
+----------+----------+
| user_id| friend_id|
+----------+----------+
|3197468391|1346449342|
|3197468391|3873244116|
+----------+----------+
// Count distinct (user_id, friend_id) pairs with a non-null friend_id.
df.filter($"friend_id".isNotNull).distinct.count
res30: Long = 30386387
// Load event_attendees.csv: one row per event, with yes/maybe/invited/no
// columns each holding a space-separated list of user ids.
val dfEventAttendees=spark.read.format("csv").option("header",true).load("hdfs:///events/data/event_attendees.csv")
dfEventAttendees.printSchema
root
|-- event: string (nullable = true) //举办方id
|-- yes: string (nullable = true) //答应参加的 id_array
|-- maybe: string (nullable = true) //可能去 id_array
|-- invited: string (nullable = true) //邀请的 id_array
|-- no: string (nullable = true) //不去的 id_array
表的优化
第一种
// Normalize the wide RSVP table into long form: one
// (event_id, user_id, attend_type) row per reply, built per RSVP column and
// unioned at the end.
// "yes" replies.
val dfYes = dfEventAttendees.select($"event".as("event_id"),$"yes")
.withColumn("user_id", explode(split($"yes"," ")))
.filter($"user_id".isNotNull).withColumn("attend_type",lit("yes"))
.drop("yes")
// "maybe" replies.
val dfMaybe = dfEventAttendees.select($"event".as("event_id"),$"maybe")
.withColumn("user_id", explode(split($"maybe"," ")))
.filter($"user_id".isNotNull).withColumn("attend_type",lit("maybe"))
.drop("maybe")
// "invited" users.
val dfInvited = dfEventAttendees.select($"event".as("event_id"),$"invited")
.withColumn("user_id", explode(split($"invited"," ")))
.filter($"user_id".isNotNull).withColumn("attend_type",lit("invited"))
.drop("invited")
// "no" replies.
val dfNo = dfEventAttendees.select($"event".as("event_id"),$"no")
.withColumn("user_id", explode(split($"no"," ")))
.filter($"user_id".isNotNull).withColumn("attend_type",lit("no"))
.drop("no")
// Total reply rows across the four RSVP types.
dfYes.union( dfMaybe).union(dfInvited ).union( dfNo ).count
dfYes.show(2)
举办方id 用户id 类型(答应赴约)
+----------+----------+-----------+
| event_id| user_id|attend_type|
+----------+----------+-----------+
|1159822043|1975964455| yes|
|1159822043| 252302513| yes|
+----------+----------+-----------+
第二种
// RDD version of the same normalization: read the raw CSV, drop the header
// row (it lives in partition 0), split each line into its 5 fields
// (-1 keeps trailing empty fields so x(1)..x(4) always exist).
// Fixes vs. the original: the line was split twice — the second
// .map(l => l.split(",")) was applied to an Array[String] and cannot
// compile — and `if idx == 0 iter.drop(1) else iter` is invalid Scala 2
// syntax (the condition needs parentheses). The first split also used
// ", " (comma + space) instead of ",".
val rdd = sc.textFile("hdfs:///events/data/event_attendees.csv")
  .mapPartitionsWithIndex((idx, iter) => if (idx == 0) iter.drop(1) else iter)
  .map(_.split(",", -1))
// Expand each RSVP column into (event_id, user_id) pairs.
val rddYes = rdd.flatMap(x => x(1).split(" ").map(y => (x(0), y)))
val rddMaybe = rdd.flatMap(x => x(2).split(" ").map(y => (x(0), y)))
val rddInvited = rdd.flatMap(x => x(3).split(" ").map(y => (x(0), y)))
val rddNo = rdd.flatMap(x => x(4).split(" ").map(y => (x(0), y)))
val rddAll = rddYes.union(rddMaybe).union(rddInvited).union(rddNo)
spark SQL
统计回复的所有用户(去重)
// Spark SQL equivalent: lateral view explode each RSVP column, tag its
// attend_type, union all four, and count the distinct non-null replies.
// NOTE(review): .stripMargin is a no-op here — the string contains no '|'
// margin characters.
val sql="""select count(*) from (
select distinct event as event_id,user_id,"yes" as attend_type from event_attendees
lateral view explode(split(yes," ")) t as user_id
union all
select distinct event as event_id,user_id,"maybe" as attend_type from event_attendees
lateral view explode(split(maybe," ")) t as user_id
union all
select distinct event as event_id,user_id,"invited" as attend_type from event_attendees
lateral view explode(split(invited," ")) t as user_id
union all
select distinct event as event_id,user_id,"no" as attend_type from event_attendees
lateral view explode(split(no," ")) t as user_id) t
where user_id is not null""".stripMargin
spark.sql(sql).show
+--------+
|count(1)|
+--------+
|11245008|
+--------+
train.csv
// Load train.csv: per (user, event) interaction records with a timestamp.
val dfTrain=spark.read.format("csv").option("header",true).load("hdfs:///events/data/train.csv")
dfTrain.printSchema
root
|-- user: string (nullable = true)
|-- event: string (nullable = true)
|-- invited: string (nullable = true)
|-- timestamp: string (nullable = true)
|-- interested: string (nullable = true)
|-- not_interested: string (nullable = true)
// Find the (user, event) pairs with the most rows — values above 1 mean
// duplicate records exist for that pair.
dfTrain.groupBy($"user",$"event").agg(count($"timestamp").as ("number")).orderBy($"number".desc).show(3)
+----------+----------+------+
| user| event|number|
+----------+----------+------+
|3519657066|2965442599| 3|
| 661151794| 187728438| 3|
|1895679477|2019748690| 3|
+----------+----------+------+
// Inspect one duplicated (user, event) pair — three rows differing only in
// timestamp.
dfTrain.filter($"user"===lit("3519657066") && $"event"===lit("2965442599")).show(false)
+----------+----------+-------+--------------------------------+----------+--------------+
|user |event |invited|timestamp |interested|not_interested|
+----------+----------+-------+--------------------------------+----------+--------------+
|3519657066|2965442599|0 |2012-10-25 16:55:01.925000+00:00|0 |0 |
|3519657066|2965442599|0 |2012-10-25 04:02:06.024000+00:00|0 |0 |
|3519657066|2965442599|0 |2012-10-24 02:19:01.811000+00:00|0 |0 |
+----------+----------+-------+--------------------------------+----------+--------------+
相同的user 和 event 我们取timestamp最大的那条记录
import org.apache.spark.sql.expressions.Window
// Dedup strategy 1: number rows per (user, event) by timestamp descending
// and keep only rank 1, i.e. the latest record for each pair.
val df=dfTrain
.withColumn("rn",row_number() over Window.partitionBy($"user",$"event").orderBy($"timestamp" desc))
.filter($"rn"===lit(1)).drop("rn")
df.filter($"user"===lit("3519657066") && $"event"===lit("2965442599")).show(false)
+----------+----------+-------+--------------------------------+----------+--------------+
|user |event |invited|timestamp |interested|not_interested|
+----------+----------+-------+--------------------------------+----------+--------------+
|3519657066|2965442599|0 |2012-10-25 16:55:01.925000+00:00|0 |0 |
+----------+----------+-------+--------------------------------+----------+--------------+
// Dedup strategy 2: repartition by (user, event), sort within each partition
// by timestamp descending, then dropDuplicates on the key.
// NOTE(review): relying on dropDuplicates to keep the sorted-first row is not
// a documented Spark guarantee — it happens to hold here (output matches the
// window approach) but should be verified per Spark version.
val df2 =dfTrain.repartition($"user",$"event").sortWithinPartitions($"timestamp".desc).dropDuplicates("user","event")
df2.filter($"user"===lit("3519657066") && $"event"===lit("2965442599")).show(false)
+----------+----------+-------+--------------------------------+----------+--------------+
|user |event |invited|timestamp |interested|not_interested|
+----------+----------+-------+--------------------------------+----------+--------------+
|3519657066|2965442599|0 |2012-10-25 16:55:01.925000+00:00|0 |0 |
+----------+----------+-------+--------------------------------+----------+--------------+