spark处理完的数据导入mysql
这里用的是loan.csv数据当做例子
需要mysql-connector-java-5.1.49.jar,根据自己的位置而定做出修改哦~
import math
from pyspark import SparkConf
from pyspark.sql import SparkSession
import traceback
import os
from math import sqrt
from pyspark.sql import Row
import pandas as pd
# spark_home 的环境变量
from pyspark.sql.functions import isnull, isnan, meanappname = "test" # 任务名称
master = "local" # 单机模式设置
conf = SparkConf().setAppName(appname).setMaster(master) # spark资源配置
conf = SparkConf().setAppName(appname).setMaster(master) # spark资源配置spark = SparkSession.builder.config(conf=conf)\
.config('spark.driver.extraClassPath', '/usr/local/apache-hive-2.3.9-bin/lib/mysql-connector-java-5.1.49.jar') \
.getOrCreate()# 1.加载数据
data = spark.read.csv("loan_data.csv",header=True,encoding="gbk")
#查看数据的前10条数据
# data.show(10)# 2.查看数据基本情况
#查看数据有多少行多少列
rows = data.count()
cols = len(data.columns)
print("总共有这么多行: ", rows)
print("总共有这么多列: ", cols)# 3.空值处理
# 查看每一列的空值
for col in data.columns:
col_null_count = data.filter(isnull(col)).count()#用filter过滤列
# print(col+"列的空值数量:{}".format(col_null_count))'''
home_ownership列的空值数量:50 使用众数填充
title列的空值数量:527426 控制太多,删除列
dti列的空值数量:2 用均值填充或者删除行
'''
data = data.drop("title")
# 计算home_ownership列的众数
home_ownership_group_count= data.groupBy("home_ownership").count().sort("count",ascending = False).limit(1)home_ownership_group_count = home_ownership_group_count.collect()[0][0]
# print(home_ownership_group_count)
#使用刚才计算出的众数来进行填充
data = data.na.fill(home_ownership_group_count,subset="home_ownership")
#用均值填充dti列的空值
dti_mean = data.filter(~ isnull("dti")).select(mean("dti")).collect()[0][0]
data = data.na.fill(str(dti_mean), subset="dti")# 4.重复值处理
# 4.1检查重复值
# 4.2处理重复值#检查
# data.groupBy('id').count().sort("count",ascending = False).show()"""
id = 36271262
"""
data = data.dropDuplicates(["id"])
# 5、错误值处理:输入错误、量纲不统一、逻辑错误
# data.filter((data.int_rate < 0) & (data.int_rate > 100)).show()
data = data.drop("sub_grade")
data = data.drop("funded_amnt_inv")
# 6、保存到数据库
# url = "jdbc:mysql://192.168.110.128:3306/loan"
# driver = "com.mysql.jdbc.Driver"
# user = "root"
# passwd = "123"
# db = "loan"
# table = "loan_cleaned"
# data.write.format("jdbc").mode("append").options(
# url=url,
# driver=driver,
# user=user,
# password=passwd,
# dbtable=db + "." + table).save()