数据分析之数据整理
1, 数据表合并与空值处理
2, 重复值与异常值处理
3, 增加与修改列
4, 数据采样与描述性统计
# Set up the PySpark environment.
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,Row
# Build (or reuse) the session for this application, then grab its
# SparkContext for RDD-level operations later on.
spark = (SparkSession.builder
         .master('local[*]')
         .appName('dataMungling')
         .getOrCreate())
sc = spark.sparkContext
# Prepare the sample data.
# Three tables in total: employee info, salary, and designation.
# 1. Employee info table, created by parallelizing in-memory Python tuples.
_emp_rows = [
    (1, "John", 25), (2, "Ray", 35), (3, "Mike", 24), (4, "Jane", 28),
    (5, "Kevin", 26), (6, "Vincent", 35), (7, "James", 38), (8, "Shane", 32),
    (9, "Larry", 29), (10, "Kimberly", 29), (11, "Alex", 28),
    (12, "Garry", 25), (13, "Max", 31),
]
employees = sc.parallelize(_emp_rows).toDF(["emp_id","name","age"])
print(type(employees))  # should be pyspark.sql.dataframe.DataFrame
employees.collect()     # pull the rows back to the driver for a quick look
<class 'pyspark.sql.dataframe.DataFrame'>
Out[5]:
[Row(emp_id=1, name='John', age=25),
Row(emp_id=2, name='Ray', age=35),
Row(emp_id=3, name='Mike', age=24),
Row(emp_id=4, name='Jane', age=28),
Row(emp_id=5, name='Kevin', age=26),
Row(emp_id=6, name='Vincent', age=35),
Row(emp_id=7, name='James', age=38),
Row(emp_id=8, name='Shane', age=32),
Row(emp_id=9, name='Larry', age=29),
Row(emp_id=10, name='Kimberly', age=29),
Row(emp_id=11, name='Alex', age=28),
Row(emp_id=12, name='Garry', age=25),
Row(emp_id=13, name='Max', age=31)]
# 2. Salary table, read from salary.json.
salary = spark.read.json("file:///work/大数据课件/data/salary.json")
# 3. Designation table, read from designation.json.
designation = spark.read.json("file:///work/大数据课件/data/designation.json")
# Preview the first 5 rows of each table.
for _tbl in (salary, designation):
    _tbl.show(5)
+----+------+
|e_id|salary|
+----+------+
| 1| 10000|
| 2| 12000|
| 3| 12000|
| 4| null|
| 5| 120|
+----+------+
only showing top 5 rows
+---+---------+
| id| role|
+---+---------+
| 1|Associate|
| 2| Manager|
| 3| Manager|
| 4|Associate|
| 5| Manager|
+---+---------+
only showing top 5 rows
数据合并
In [7]:
# Inner-join employees to salary on matching employee ids; preview 5 rows.
joined = employees.join(salary, employees.emp_id == salary.e_id)
joined.show(5)
+------+-----+---+----+------+
|emp_id| name|age|e_id|salary|
+------+-----+---+----+------+
| 1| John| 25| 1| 10000|
| 2| Ray| 35| 2| 12000|
| 3| Mike| 24| 3| 12000|
| 4| Jane| 28| 4| null|
| 5|Kevin| 26| 5| 120|
+------+-----+---+----+------+
only showing top 5 rows
将 3个表合并
# Merge all three tables with two joins, then keep the five core fields
# to form the final working table:
#   join 1: employee info + salary (on emp_id == e_id)
#   join 2: + designation        (on emp_id == id)
final_data = (employees
              .join(salary, employees.emp_id == salary.e_id)
              .join(designation, employees.emp_id == designation.id)
              .select("emp_id", "name", "age", "role", "salary"))
# Preview the first 5 rows of the merged table.
final_data.show(5)
+------+-----+---+---------+------+
|emp_id| name|age| role|salary|
+------+-----+---+---------+------+
| 1| John| 25|Associate| 10000|
| 2| Ray| 35| Manager| 12000|
| 3| Mike| 24| Manager| 12000|
| 4| Jane| 28|Associate| null|
| 5|Kevin| 26| Manager| 120|
+------+-----+---+---------+------+
only showing top 5 rows
空值处理
1, 丢弃
2, 平均值填充
# 1. Null handling by dropping: remove every row that contains a null.
clean_data = final_data.na.drop()
final_data.show()   # original data, for comparison
clean_data.show()   # data after the null-bearing rows are dropped
+------+--------+---+--------------+------+
|emp_id| name|age| role|salary|
+------+--------+---+--------------+------+
| 1| John| 25| Associate| 10000|
| 2| Ray| 35| Manager| 12000|
| 3| Mike| 24| Manager| 12000|
| 4| Jane| 28| Associate| null|
| 5| Kevin| 26| Manager| 120|
| 6| Vincent| 35|Senior Manager| 22000|
| 7| James| 38|Senior Manager| 20000|
| 8| Shane| 32| Manager| 12000|
| 9| Larry| 29| Manager| 10000|
| 10|Kimberly| 29| Associate| 8000|
| 11| Alex| 28| Manager| 12000|
| 12| Garry| 25| Manager| 12000|
| 13| Max| 31| Manager|120000|
+------+--------+---+--------------+------+
+------+--------+---+--------------+------+
|emp_id| name|age| role|salary|
+------+--------+---+--------------+------+
| 1| John| 25| Associate| 10000|
| 2| Ray| 35| Manager| 12000|
| 3| Mike| 24| Manager| 12000|
| 5| Kevin| 26| Manager| 120|
| 6| Vincent| 35|Senior Manager| 22000|
| 7| James| 38|Senior Manager| 20000|
| 8| Shane| 32| Manager| 12000|
| 9| Larry| 29| Manager| 10000|
| 10|Kimberly| 29| Associate| 8000|
| 11| Alex| 28| Manager| 12000|
| 12| Garry| 25| Manager| 12000|
| 13| Max| 31| Manager|120000|
# 2. Null handling by imputation: fill nulls with the mean salary.
import math
from pyspark.sql import functions as func
# Compute the average salary with a SparkSQL aggregate; the result is a
# one-row DataFrame, collected back to the driver as a list of Rows.
qset = salary.select(func.mean('salary')).collect()
print(qset)
[Row(avg(salary)=20843.333333333332)]
# Take the first field of the first result row and round it down to an int.
mean_salary = math.floor(qset[0][0])
print(mean_salary)  # display the mean salary
20843
# Fill the null salaries with the mean value computed above.
clean_data = final_data.na.fill({"salary" : mean_salary})
clean_data.show()  # inspect the filled data
+------+--------+---+--------------+------+
|emp_id| name|age| role|salary|
+------+--------+---+--------------+------+
| 1| John| 25| Associate| 10000|
| 2| Ray| 35| Manager| 12000|
| 3| Mike| 24| Manager| 12000|
| 4| Jane| 28| Associate| 20843|
| 5| Kevin| 26| Manager| 120|
| 6| Vincent| 35|Senior Manager| 22000|
| 7| James| 38|Senior Manager| 20000|
| 8| Shane| 32| Manager| 12000|
| 9| Larry| 29| Manager| 10000|
| 10|Kimberly| 29| Associate| 8000|
| 11| Alex| 28| Manager| 12000|
| 12| Garry| 25| Manager| 12000|
| 13| Max| 31| Manager|120000|
+------+--------+---+--------------+------+
更加详细的空值处理
1, 一般丢弃
2, 指定条件丢弃
3, 一般填充
4, 按列填充
# Small author table with missing values, for the detailed null demos.
authors = [
    ['Thomas', 'Hardy', 'June 2, 1840'],
    ['Charles', 'Dickens', '7 February 1812'],
    ['Mark', 'Twain', None],
    ['Jane', 'Austen', '16 December 1775'],
    ['Emily', None, None],
]
df1 = sc.parallelize(authors).toDF(["FirstName","LastName","Dob"])
df1.show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2, 1840|
| Charles| Dickens| 7 February 1812|
| Mark| Twain| null|
| Jane| Austen|16 December 1775|
| Emily| null| null|
+---------+--------+----------------+
# 1. Plain drop: discard every row that has any null.
df1.na.drop().show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2, 1840|
| Charles| Dickens| 7 February 1812|
| Jane| Austen|16 December 1775|
+---------+--------+----------------+
# 2. Conditional drop: keep only rows with at least 2 non-null values.
df1.na.drop(thresh=2).show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2, 1840|
| Charles| Dickens| 7 February 1812|
| Mark| Twain| null|
| Jane| Austen|16 December 1775|
+---------+--------+----------------+
# 3. Plain fill: replace every null with the same placeholder string.
df1.na.fill('Unknown').show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2, 1840|
| Charles| Dickens| 7 February 1812|
| Mark| Twain| Unknown|
| Jane| Austen|16 December 1775|
| Emily| Unknown| Unknown|
+---------+--------+----------------+
# 4. Per-column fill: give each column its own placeholder for nulls.
# (The original transcript ran this identical cell twice; the duplicate
# call has been removed.)
df1.na.fill({'LastName':'--','Dob':'1 Jan 1970'}).show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2, 1840|
| Charles| Dickens| 7 February 1812|
| Mark| Twain| 1 Jan 1970|
| Jane| Austen|16 December 1775|
| Emily| --| 1 Jan 1970|
+---------+--------+----------------+
异常值判定和处理
In [36]:
# Re-check the mean salary computed earlier; it is reused below as the
# replacement value for outliers.
print(mean_salary)
20843
# Outlier handling: compute the (population) standard deviation of salary
# by hand.
# Squared difference of each salary from the mean (a null salary yields null,
# which avg() then ignores).
devs = final_data.select(((final_data.salary - mean_salary) ** 2).alias("deviation"))
# Average the squared deviations, take the square root, round down to an int.
variance = devs.groupBy().avg("deviation").first()[0]
stddev = math.floor(math.sqrt(variance))
print(stddev)
30351
# Replace out-of-band salaries with the mean: a salary is kept only when it
# lies within mean ± stddev/2; anything outside that band — including
# nulls — becomes mean_salary in the new "updated_salary" column.
# NOTE(review): the original comment said "beyond 2 standard deviations",
# but the code tests mean ± stddev/2; documented here as the code behaves.
within_band = final_data.salary.between(
    mean_salary - (stddev / 2), mean_salary + (stddev / 2))
no_outlier = final_data.select(
    final_data.emp_id,
    final_data.name,
    final_data.age,
    final_data.salary,
    final_data.role,
    func.when(within_band, final_data.salary)
        .otherwise(mean_salary)
        .alias("updated_salary"),
)
no_outlier.show()
+------+--------+---+------+--------------+--------------+
|emp_id| name|age|salary| role|updated_salary|
+------+--------+---+------+--------------+--------------+
| 1| John| 25| 10000| Associate| 10000|
| 2| Ray| 35| 12000| Manager| 12000|
| 3| Mike| 24| 12000| Manager| 12000|
| 4| Jane| 28| null| Associate| 20843|
| 5| Kevin| 26| 120| Manager| 20843|
| 6| Vincent| 35| 22000|Senior Manager| 22000|
| 7| James| 38| 20000|Senior Manager| 20000|
| 8| Shane| 32| 12000| Manager| 12000|
| 9| Larry| 29| 10000| Manager| 10000|
| 10|Kimberly| 29| 8000| Associate| 8000|
| 11| Alex| 28| 12000| Manager| 12000|
| 12| Garry| 25| 12000| Manager| 12000|
| 13| Max| 31|120000| Manager| 20843|
# Recompute the mean and standard deviation on the cleaned salary column.
m_salary = math.floor(
    no_outlier.select(func.mean('updated_salary')).collect()[0][0])
print(m_salary)
# Squared deviations of the updated salaries from the new mean.
df2 = no_outlier.select(
    ((no_outlier.updated_salary - m_salary) ** 2).alias('devt'))
nstddev = math.sqrt(df2.groupBy().avg('devt').first()[0])
print(nstddev)
14809
4961.019420674313
# Show only the rows whose salary was changed by the outlier replacement.
no_outlier.filter(no_outlier.salary != no_outlier.updated_salary).show()
+------+-----+---+------+-------+--------------+
|emp_id| name|age|salary| role|updated_salary|
+------+-----+---+------+-------+--------------+
| 5|Kevin| 26| 120|Manager| 20843|
| 13| Max| 31|120000|Manager| 20843|
+------+-----+---+------+-------+--------------+
重复值的处理
# Sample data containing duplicate rows, for the de-duplication demos.
authors = [
    ['Thomas', 'Hardy', 'June 2,1840'],
    ['Thomas', 'Hardy', 'June 2,1840'],
    ['Thomas', 'H', None],
    ['Jane', 'Austen', '16 December 1775'],
    ['Emily', None, 'June 2,1840'],
]
df1 = sc.parallelize(authors).toDF(["FirstName","LastName","Dob"])
df1.show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2,1840|
| Thomas| Hardy| June 2,1840|
| Thomas| H| null|
| Jane| Austen|16 December 1775|
| Emily| null| June 2,1840|
+---------+--------+----------------+
# Plain de-duplication: drop rows that are identical across all columns.
df1.dropDuplicates().show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Jane| Austen|16 December 1775|
| Emily| null| null|
| Thomas| Hardy| June 2,1840|
| Thomas| H| null|
+---------+--------+----------------+
# Column-based de-duplication: first keep one row per FirstName,
# then one row per Dob.
by_first = df1.dropDuplicates(subset=["FirstName"])
by_first.dropDuplicates(subset=["Dob"]).show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Emily| null| June 2,1840|
| Jane| Austen|16 December 1775|
+---------+--------+----------------+
数据列处理
# Concatenate name and age into one string column via a UDF.
concat_func = func.udf(lambda name, age: f"{name}_{age}")
concat_df = final_data.withColumn(
    "name_age", concat_func(final_data.name, final_data.age))
concat_df.show()
+------+--------+---+--------------+------+-----------+
|emp_id| name|age| role|salary| name_age|
+------+--------+---+--------------+------+-----------+
| 1| John| 25| Associate| 10000| John_25|
| 2| Ray| 35| Manager| 12000| Ray_35|
| 3| Mike| 24| Manager| 12000| Mike_24|
| 4| Jane| 28| Associate| null| Jane_28|
| 5| Kevin| 26| Manager| 120| Kevin_26|
| 6| Vincent| 35|Senior Manager| 22000| Vincent_35|
| 7| James| 38|Senior Manager| 20000| James_38|
| 8| Shane| 32| Manager| 12000| Shane_32|
| 9| Larry| 29| Manager| 10000| Larry_29|
| 10|Kimberly| 29| Associate| 8000|Kimberly_29|
| 11| Alex| 28| Manager| 12000| Alex_28|
| 12| Garry| 25| Manager| 12000| Garry_25|
| 13| Max| 31| Manager|120000| Max_31|
+------+--------+---+--------------+------+-----------+
# Add a derived column: everyone's age bumped by 10 years.
data_new = concat_df.withColumn("age_incremented", concat_df.age + 10)
data_new.select('emp_id', 'age', 'age_incremented').show()
+------+---+---------------+
|emp_id|age|age_incremented|
+------+---+---------------+
| 1| 25| 35|
| 2| 35| 45|
| 3| 24| 34|
| 4| 28| 38|
| 5| 26| 36|
| 6| 35| 45|
| 7| 38| 48|
| 8| 32| 42|
| 9| 29| 39|
| 10| 29| 39|
| 11| 28| 38|
| 12| 25| 35|
| 13| 31| 41|
+------+---+---------------+
# Value replacement restricted to a single column — the SQL equivalent:
#   update df1 set FirstName = 'Charlotte' where FirstName = 'Emily'
df1.replace('Emily','Charlotte','FirstName').show()
+---------+--------+----------------+
|FirstName|LastName| Dob|
+---------+--------+----------------+
| Thomas| Hardy| June 2,1840|
| Thomas| Hardy| June 2,1840|
| Thomas| H| null|
| Jane| Austen|16 December 1775|
|Charlotte| null| June 2,1840|
+---------+--------+----------------+