1. Time handling: extract the date and hour:
from pyspark.sql.functions import col, hour

train = train.toDF("label", "uId", "adId", "operTime", "siteId", "slotId", "contentId", "netType")
print(train.dtypes)
train.show(n=20)
# Cast the operation timestamp to a date and extract the hour
train = train.withColumn("operDate", col("operTime").cast("date"))
train = train.withColumn("hour", hour(col("operTime")))
# Keep operDate in the range 2019-03-26 to 2019-03-31 (6 days of data)
train = train.filter((train.operDate >= "2019-03-26") & (train.operDate <= "2019-03-31"))
train.show(n=10)
train.persist()
The extracted date and hour columns can be seen in the show() output above.
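As a quick sanity check on the new columns, a minimal sketch (assuming the train DataFrame built above) that counts records per extracted date and hour:
# Sanity check on the extracted columns: record counts per date and hour
train.groupBy("operDate", "hour").count().orderBy("operDate", "hour").show(24)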
2. Group, sort within groups, and add a ranking column:
Reference: pyspark系列--统计基础 https://blog.csdn.net/suzyu12345/article/details/79673557#6-%E6%9C%80%E5%A4%A7%E5%80%BC%E6%9C%80%E5%B0%8F%E5%80%BC
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Number each user's exposures to an ad in chronological order
train = train.withColumn('ad_counts', row_number().over(Window.partitionBy(['uId', 'adId']).orderBy('operDate')))
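A typical follow-up, sketched here under the same column names (the first_exposure name is only illustrative), is to keep just the first record of each (uId, adId) pair using the rank computed above:
from pyspark.sql.functions import col

# Keep each user's first exposure to an ad (ad_counts == 1 after the row_number above)
first_exposure = train.filter(col('ad_counts') == 1)
first_exposure.show(10)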
3. Handling null values:
# Inspect rows where adId is null
train.filter("adId is null").select("adId").limit(10).show()
# Drop every row that contains a null in any column
train = train.na.drop()
# Count rows that have a null in any of these columns
bad_rows_df = logs_df.filter(logs_df['host'].isNull() |
                             logs_df['timestamp'].isNull() |
                             logs_df['method'].isNull() |
                             logs_df['endpoint'].isNull() |
                             logs_df['status'].isNull() |
                             logs_df['content_size'].isNull() |
                             logs_df['protocol'].isNull())
bad_rows_df.count()
# Count the number of nulls in each column of the DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum

def count_null(col_name):
    return spark_sum(col(col_name).isNull().cast('integer')).alias(col_name)

# Build up a list of column expressions, one per column.
exprs = [count_null(col_name) for col_name in logs_df.columns]

# Run the aggregation. The *exprs converts the list of expressions into
# variable function arguments.
logs_df.agg(*exprs).show()
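Dropping every row that contains a null, as na.drop() does above, is not always what you want; a hedged sketch of two alternatives (the column names here are only illustrative):
# Alternative 1: fill nulls with a default value instead of dropping the row
logs_df = logs_df.na.fill({'content_size': 0})
# Alternative 2: drop a row only when specific columns are null
logs_df = logs_df.na.drop(subset=['host', 'timestamp'])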
4. Combining DataFrames: join / union
For background on how joins are implemented, see: Spark SQL 之 Join 实现 http://sharkdtu.com/posts/spark-sql-join.html
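Since this section only links to the reference, here is a minimal PySpark sketch of both operations, assuming a DataFrame ad_info that shares an adId column with train (the names are illustrative):
# Hypothetical join on a shared key, and a union of two DataFrames with matching schemas
joined = train.join(ad_info, on='adId', how='left')
combined = train.select('uId', 'adId').union(test.select('uId', 'adId'))
joined.show(5)
print(combined.count())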
5. Adding a column: withColumn('new_name', operation(column_name)); lit() creates a constant-valued column.
from pyspark.sql.functions import lit

# Mark the training set with mark = 0
train = train.withColumn('mark', lit(0))
# Mark the test set with mark = 1
test = test.withColumn('mark', lit(1))
print(train.dtypes)
train.limit(10).show()
print(test.dtypes)
test.limit(10).show()
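The usual point of the mark column is to union train and test for shared feature engineering and then split them back afterwards; a sketch under that assumption (the two schemas must match for union to work):
# Assumed usage of the mark column: stack train and test, process them together, then split
full = train.union(test)
# ... shared feature engineering on `full` goes here ...
train = full.filter(full.mark == 0).drop('mark')
test = full.filter(full.mark == 1).drop('mark')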
6. Log analysis
1) Extracting log fields with regular expressions.
Parse the semi-structured log data into separate columns with the built-in regexp_extract() function. It matches a column against a regular expression that has one or more capture groups and lets us extract one of the matched groups; we use one regular expression per field we want to extract.
To learn more about regular expressions, the RegexOne site (http://regexone.com/) is a good place to start, and the Regular Expressions Cookbook by Goyvaerts and Levithan (http://shop.oreilly.com/product/0636920023630.do) is a useful reference.
from pyspark.sql.functions import regexp_extract

host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
status_pattern = r'\s(\d{3})\s'

logs_df = base_df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'))
logs_df.show(10, truncate=True)
print((logs_df.count(), len(logs_df.columns)))
Reference: Apache Spark实现可扩展日志分析 https://mp.weixin.qq.com/s?__biz=MzU1NDA4NjU2MA==&mid=2247496339&idx=2&sn=2120b2fdde2295d3bb34919a06ce4fdf&scene=21#wechat_redirect
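One caveat worth adding: regexp_extract() returns an empty string rather than null when a pattern fails to match, so the per-column null counts from section 3 can miss badly parsed rows. A sketch that counts empty extractions instead, over the string columns built above:
# Count rows where each regex failed to match (empty string result), per string column
from pyspark.sql.functions import col, sum as spark_sum

empty_exprs = [spark_sum((col(c) == '').cast('integer')).alias(c)
               for c in ['host', 'timestamp', 'method', 'endpoint', 'protocol']]
logs_df.agg(*empty_exprs).show()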
7. Pivot tables: pivot(), usually combined with groupBy(). Spark has no built-in unpivot operation; it can be implemented with the stack() function (Scala example below):
# Create a DataFrame from a list of tuples
data = [(2017, 'dotNet', 150), (2017, 'dotNet', 180), (2017, 'java', 98), (2018, 'java', 120), (2018, 'java', 100)]
spark_df = spark.createDataFrame(data, schema=['year', 'course', 'price'])
spark_df.show()
spark_df.groupby('year').pivot('course', ['dotNet','java']).sum('price').show()
val data = Seq(("Banana", 1000, "USA"), ("Carrots", 1500, "USA"), ("Beans", 1600, "USA"),
               ("Orange", 2000, "USA"), ("Orange", 2000, "USA"), ("Banana", 400, "China"),
               ("Carrots", 1200, "China"), ("Beans", 1500, "China"), ("Orange", 4000, "China"),
               ("Banana", 2000, "Canada"), ("Carrots", 2000, "Canada"), ("Beans", 2000, "Mexico"))

import spark.sqlContext.implicits._
val df = data.toDF("Product", "Amount", "Country")
df.show()
// pivot
val pivotDF = df.groupBy("Product", "Country")
  .sum("Amount")
  .groupBy("Product")
  .pivot("Country")
  .sum("sum(Amount)")
pivotDF.show()
// unpivot using stack()
val unPivotDF = pivotDF.select($"Product",
    expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"))
  .where("Total is not null")
unPivotDF.show()
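stack() is also available from PySpark through a SQL expression, so the unpivot above can be written in Python as well; a sketch assuming a pivot_df with the same Product/Canada/China/Mexico columns as the Scala pivotDF:
# Hypothetical PySpark equivalent of the Scala unpivot, using stack() via expr()
from pyspark.sql.functions import expr

unpivot_df = pivot_df.select(
    'Product',
    expr("stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country, Total)")
).where('Total is not null')
unpivot_df.show()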