Python数据分析代码

1.1安装第三方库

pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple

pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple


1.2

numpy数组

import numpy as np


def _show(value):
    """Print a value and then the Python type of that value."""
    print(value)
    print(type(value))


# ---- Creating ndarray objects ----
# Approach 1: build from Python lists / tuples.
n1 = np.array([1, 2, 3, 4])
_show(n1)
n2 = np.array((2, 3, 4.5, 6))
_show(n2)
# Rows may be given as a mix of lists and tuples.
n3 = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [2, 1, 5, 10], (5, 7, 8, 9)])
_show(n3)

# Approach 2: build with NumPy factory functions.
n4 = np.arange(10)
_show(n4)
n5 = np.ones((3, 3, 4))
_show(n5)
n6 = np.zeros((5, 3, 4))
_show(n6)
n7 = np.full((3, 3, 4), 5)
_show(n7)
n8 = np.eye(5)
_show(n8)

# ---- Array attributes ----
for attribute in (n4.shape, n4.size, n4.dtype):
    print(attribute)

# ---- Converting arrays ----
# Changing the shape: 100 elements as 2 x 5 x 10, then flattened back to 1-D.
n9 = np.arange(100).reshape(2, 5, 10)
print(n9)
n10 = n9.flatten()
print(n10)

# Changing the element type: int32 -> float32.
n11 = np.ones((2, 3, 4), dtype=np.int32)
print(n11)
n12 = n11.astype(dtype=np.float32)
print(n12)

# Converting an array to nested Python lists.
n13 = np.full((2, 3, 4), 25)
print(n13)
n14 = n13.tolist()
_show(n14)

1.3一维数组,二维数组

import numpy as np
import random

# 1-D array
a1 = np.arange(10)
# print(a1)
# print(a1[1])
# print(a1[2:5])

# Multi-dimensional array (reshape(2, 3, 4) yields three axes)
a2 = np.arange(24).reshape(2, 3, 4)
# print(a2)
# print(a2[1,2:3])

# astype() returns a new array (a copy of the original) with the given dtype.
# np.float_ was removed in NumPy 2.0; np.float64 is the same type under its
# canonical name and works on both old and new NumPy.
a3 = a2.astype(np.float64)
# print(a3)

# Array -> nested Python lists via tolist()
a4 = a3.tolist()
# print(a4)

# Random numbers
# NOTE: b1 is drawn before the seed is set, so only b2..b5 are reproducible.
b1 = np.random.rand(3, 10, 2)
# print(b1)
np.random.seed(20)
b2 = np.random.randint(1, 20, 10)
# print(b2)

b3 = np.random.randint(1, 20, (4, 4))
# print(b3)

b4 = np.random.random(100)
# print(b4)

b5 = np.random.randn(10, 5)
# print(b5)

# Sorting (in place)
# b2.sort()
# print(b2)
# b3.sort(axis=1)  # sort within each row
# print(b3)
#
# b3.sort(axis=0)  # sort within each column
# print(b3)

# De-duplication and repetition
list1 = [1, 2, 1, 1, 1, 5, 4, 6, 4, 8, 7, 8, 7, 2, 5, 6, 1, 8, 2, 1, 5, 4, 8, 9]
c1 = np.array(list1)
print(c1)
d1 = np.unique(c1)  # sorted distinct values
print(d1)

c2 = np.arange(4)
print(c2)
d2 = np.tile(c2, 3)  # whole array repeated three times
print(d2)

c3 = np.arange(10).reshape(2, 5)
# print(c3)

# Repeat every row twice (axis=0) ...
d3 = c3.repeat(2, axis=0)
print(d3)

# ... and every column twice (axis=1). The original repeated axis=0 a second
# time, which only reproduced d3.
d4 = c3.repeat(2, axis=1)
print(d4)
 
#统计函数
 
 

1.4 pandas——创建

import numpy as np
import pandas as nd  # NOTE(review): alias kept from the original; `pd` is the usual convention
import xlrd  # kept from the original; not referenced directly in this section

# ---- Series construction ----
# From a list, with explicit string labels.
a1 = nd.Series([1, 2, 3, 4], index=['x', 'b', 'c', 'f'])
print(a1)
print(type(a1))

# From a scalar: the value is broadcast to every label.
a2 = nd.Series(25, index=['a', 'x', 'y'])
print(a2)

# From a dict, re-labelled by an explicit index.
_mapping = {'a': 1, 'b': 2, 'c': 3}
a3 = nd.Series(_mapping, index=('a', 'x', 'y', 'z'))
print(a3)

# From an ndarray.
a4 = nd.Series(np.arange(4), index=['x', 'b', 'c', 'f'])
print(a4)
a5 = nd.Series(np.arange(5), index=np.arange(9, 4, -1))
print(a5)

# ---- DataFrame construction ----
# From a 2-D ndarray (default integer row/column labels).
a6 = nd.DataFrame(np.arange(20).reshape(4, 5))
print(a6)

# From a dict of equal-length lists, with explicit row labels.
_people = {'name': ['zhangsan', 'lisi', 'wangwu'], 'age': [17, 18, 19]}
a7 = nd.DataFrame(_people, index=('a', 'b', 'c'))
print(a7)

# From a dict of Series.
dt = {
    'one': nd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': nd.Series([4, 5, 6], index=['a', 'b', 'c']),
}
a8 = nd.DataFrame(dt)
print(a8)

# NOTE: a8 is rebound here, as in the original; every row label is an empty string.
_price_index = {
    '城市': ['北京', '上海', '广州', '深圳', '沈阳'],
    '环比': [101.5, 101.2, 101.3, 102.0, 100.1],
    '同比': [120.7, 127.3, 119.4, 140.9, 101.4],
    '定基': [121.4, 127.8, 120.0, 145.5, 101.6],
}
a8 = nd.DataFrame(_price_index, index=('', '', '', '', ''))
print(a8)

# From a CSV file.
a10 = nd.read_csv('11.csv')
print(a10)

_records = {
    'id': [1001, 1002, 1003, 1004, 1005, 1006],
    'data': ['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06'],
    'city': ['beijing', 'shagnhai', 'guangzhou', 'shenzhen', 'qingdao', 'wuhan'],
    'age': [17, 18, 19, 20, 18, 19],
    'category': ['100-A', '101-B', '102-C', '103-D', '104-E', '105-F'],
}
a11 = nd.DataFrame(_records)
# index=('','','','',''))
print(a11)

# Three equivalent ways of reading one column: by name, by label, by position.
print(a11['city'])
print(a11.loc[:, "city"])
print(a11.iloc[:, 3])

# a9=nd.read_table('1.txt')
# print(a9)

# From a legacy Excel workbook.
a13 = nd.read_excel('11.xls')
print(a13)
 
 
 

1.5pandas--增删查改

import numpy as np
import pandas as nd

_columns = {
    'id': [1001, 1002, 1003, 1004, 1005, 1006],
    'data': ['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06'],
    'city': ['beijing', 'shagnhai', 'guangzhou', 'shenzhen', 'qingdao', 'wuhan'],
    'age': [17, 18, 19, 20, 18, 19],
    'category': ['100-A', '101-B', '102-C', '103-D', '104-E', '105-F'],
}
a = nd.DataFrame(_columns)
print(a)

# Select one column: by name, by label (loc) and by position (iloc).
print(a['city'])
print(a.loc[:, "city"])
print(a.iloc[:, 3])

# Select several columns, again in the three equivalent spellings.
_wanted = ['id', 'city', 'age']
print(a[_wanted])
print(a.loc[:, _wanted])
print(a.iloc[:, [0, 2, 3]])

# Insert a new column, then update an existing one element-wise.
a['score'] = [450, 460, 480, 411, 555, 222]
print(a)
a['age'] = a['age'].add(2)
print(a['age'])
import pandas as pd
import numpy as np
# # From a 2-D array
# y1=pd.DataFrame(np.arange(10).reshape(2,5))
# print(y1)
# print(type(y1))
# # From a dict
# y2=pd.DataFrame({'id':[1,2,3],'name':['zhagnsai','kusu','wangwu'],'age':[20,18,19]})
# print(y2)
# # Series
# y3=pd.Series([3,1,5,7,8])
# print(y3)
#
# y4=pd.DataFrame([[1,2],[3,4]],columns=['a','b'],index=[1,2]) # index: row labels, columns: column labels
# print(y4)
#
# y5=pd.DataFrame(np.arange(2),columns=['a'],index=[1,2])
# print(y5)

a11 = pd.DataFrame({'id': [1001, 1002, 1003, 1004, 1005, 1006],
                    'data': pd.date_range('20180101', periods=6),
                    'city': ['beijing', 'shagnhai', 'guangzhou', 'shenzhen', 'qingdao', 'wuhan'],
                    'age': [17, 48, 91, 20, 18, 19],
                    'category': ['100-A', '101-B', '102-C', '103-D', '104-E', '105-F']})

print(a11.to_string(index=True))

# Select a single column (the original comment said "row", but every
# statement below selects columns).
x1 = a11['id']            # Series
print(x1)

x2 = a11.loc[:, ['id']]   # one-column DataFrame, because a list is passed
print(x2)

x3 = a11.iloc[:, 0]       # Series, selected by position
print(x3)

# Select several columns.
x4 = a11[['id', 'city']]
print(x4)

x5 = a11.loc[:, ['id', 'city']]
print(x5)

x6 = a11.iloc[:, [0, 2]]
print(x6)

# Insert new columns.
a11['corre'] = [255, 511, 444, 355, 444, 999]
print(a11)

a11['group'] = ''
print(a11)

# Label each row by age. The original used `> 40` and `< 40`, which left a
# row with age exactly 40 with an empty group; the complement mask makes the
# two groups exhaustive.
elder_mask = a11['age'] > 40
a11.loc[elder_mask, 'group'] = 'elder'
a11.loc[~elder_mask, 'group'] = 'younger'
print(a11)

a11['age'] = a11['age'] + 2
print(a11)

# Deleting columns
# del a11['age']
# print(a11)

# a11.drop(['group'],axis=1,inplace=True)
# print(a11)

# a11.drop(a11.columns[0],axis=1,inplace=True)
# print(a11)

# With inplace=False, drop() returns a new frame and leaves a11 untouched.
# The original discarded that result and printed the unchanged a11; keep the
# result and print it so the effect of the drop is visible.
a12 = a11.drop(a11.columns[0:3], axis=1, inplace=False)
print(a12)
import numpy as np
import pandas as pd
y1 = pd.DataFrame(
    {'id': ['1001', '1002', '1003', '1004', '1005', '1006'],
     'data': pd.date_range('20180101', periods=6),
     'city': ['beijing', 'shagnhai', 'guangzhou', 'chenzheng', 'wuhan', 'qingdao'],
     'age': [17, 42, 19, 20, 18, 41],
     'cetegory': ['100-A', '100-B', '100-C', '100-D', '100-E', '100-F']},
    index=('a', 'b', 'c', 'd', 'e', 'f'))  # row labels
print(y1)


# One row
print(y1[0:1])       # first row, as a one-row DataFrame (positional slice)
print(y1.iloc[0])    # first row by position, as a Series
print(y1.loc['a'])   # row labelled 'a', as a Series

# Several rows
print(y1[0:3])               # first three rows
print(y1.iloc[0:3])          # first three rows by position
print(y1.loc[['a', 'b']])    # rows labelled 'a' and 'b'

# Boolean condition
print(y1[y1['id'] == '1004'])  # rows whose id equals '1004'

# Append a new row labelled 'g'. The original supplied the date and the age
# as strings ('2018-01-07', '20'), which mixes strings into the datetime and
# integer columns; pass values of the column's own kind instead.
y1.loc['g'] = ['1007', pd.Timestamp('2018-01-07'), 'wuhan', 20, '108-F']
print(y1)

# Delete rows (drop returns a new frame; y1 itself is unchanged)
x = y1.drop(y1.index[0:1])  # without the first row
print(x)
y = y1.drop(y1.index[0:2])  # without the first two rows
print(y)

# Row slices
print(y1[0:3])        # first three rows
print(y1.iloc[0:3])   # first three rows by position

# Column selection
print(y1[['id', 'city']])      # the 'id' and 'city' columns
print(y1[y1.columns[0:3]])     # the first three columns

# Rows and columns together
print(y1[0:3][['id', 'age']])  # 'id' and 'age' of the first three rows

1.6排序

import numpy as np
import pandas as pd

# A 4 x 5 frame with lettered columns and 1-based row labels.
x = pd.DataFrame(
    np.arange(20).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=[1, 2, 3, 4],
)
print(x)

# Sort by column labels (axis=1), ascending.
w = x.sort_index(axis=1, ascending=True)
print(w)

# Sort by column labels, descending.
y = x.sort_index(axis=1, ascending=False)
print(y)

# Sort by row labels (axis=0), ascending.
z = x.sort_index(axis=0)
print(z)

# Sort by row labels, descending.
print(x.sort_index(axis=0, ascending=False))

# Sort by the values of columns 'a' then 'b'. The original prints the
# ascending/descending pair twice, so the same four outputs are produced here.
for ascending in (True, False, True, False):
    print(x.sort_values(by=['a', 'b'], ascending=ascending))

# A seeded 5 x 5 array of random integers in [40, 500).
np.random.seed(10)
score = np.random.randint(40, 500, (5, 5))

# Subject names (column labels).
sub = ['语', '数', '英', '政', '体']

# Student names (row labels).
stu = ["同学" + str(i) for i in range(5)]

# Score table: one row per student, one column per subject.
data = pd.DataFrame(score, columns=sub, index=stu)
print(data)

# Rows ordered by the '语' column, highest first.
print(data.sort_values(by=['语'], ascending=False))

# Rows ordered by student label, descending.
print(data.sort_index(axis=0, ascending=False))

1.7分析DataFrame数据

import pandas as pd

# Load the music data set into a DataFrame.
mu = pd.read_csv("musicdata.csv")

# Show the whole frame.
print(mu)

# Summary statistics of the 'value_actual' column.
actual_values = mu['value_actual']
print(actual_values.describe())

# Occurrence count of each distinct 'format'; value_counts() orders by
# frequency, so the slice keeps the six most frequent entries.
format_counts = mu['format'].value_counts()
print(format_counts[0:6])

# (rows, columns)
print(mu.shape)

# First and last ten rows for a quick look at the data.
print(mu.head(10))
print(mu.tail(10))

1.8 实训

1.1

import pandas as pd

# The CSV is decoded as GBK.
mu = pd.read_csv("某地区房屋销售数据.csv", encoding='gbk')
print(mu)

# (rows, columns)
print("Data shape:", mu.shape)

# Column labels
print("Column names:", mu.columns)

# Number of dimensions (printed twice, as in the original)
for _ in range(2):
    print(mu.ndim)

# Row index
print(mu.index)

# First row by position (a one-row DataFrame)
print(mu.iloc[0:1])

# Row whose label is 1
print(mu.loc[1])

# Two columns by label, then columns 1 and 3 by position
price_and_rooms = ['房屋价格', '配套房间数']
print(mu.loc[:, price_and_rooms])
print(mu.iloc[:, [1, 3]])

# The same two columns restricted to the first three rows
print(mu[0:3][price_and_rooms])
 

1.2

import pandas as pd

# Read "musicdata.csv" into the DataFrame `mu`.
mu = pd.read_csv("musicdata.csv")

# Full contents.
print(mu)

# describe() on 'value_actual': count, mean, std, min, quartiles, max.
value_summary = mu['value_actual'].describe()
print(value_summary)

# Frequency of every distinct 'format', limited to the first six entries of
# the frequency-ordered result.
top_formats = mu['format'].value_counts()[0:6]
print(top_formats)

# Dimensions as (rows, columns).
print(mu.shape)

# Leading ten rows.
print(mu.head(10))

# Trailing ten rows.
print(mu.tail(10))

1.3

import pandas as pd
import numpy as np

# Two columns of sample data: 'grammer' and 'popularity'.
# NOTE(review): the fourth 'grammer' entry is the *string* 'NaN', not a missing
# value, so isnull()/fillna() will not treat it as missing - confirm whether
# np.nan was intended.
data = {'grammer': ['pythone', 'java', 'go', 'NaN', 'python', 'c', 'c++'],
        'popularity': [1, np.nan, np.nan, 4, 5, 7, 8]}

# Build the DataFrame.
df = pd.DataFrame(data)
print(df)

# Persist it as CSV (the row index is written as the first, unnamed column).
df.to_csv('data1.csv')

# (rows, columns)
print(df.shape)

# The 'grammer' column
print(df['grammer'])

# The last three rows
print(df.tail(3))

# Remove the last row, addressed by its actual label rather than by
# "row count - 1", which only works while labels are 0..n-1.
df.drop(index=df.index[-1], inplace=True)
print(df)

# Append one row. The original used the string label '6', which mixes a str
# into an otherwise integer index; use the next integer label instead.
df.loc[df.index.max() + 1] = ['php', 6.6]
print(df)

# Order by 'popularity', highest first (missing values sort last).
x = df.sort_values(by=['popularity'], ascending=False)
print(x)

1.9 Matplotlib

1.1

import matplotlib.pyplot as plt

# One 10 x 8 inch figure; create the first two cells of a 2 x 2 grid.
plt.figure(figsize=(10, 8))
for cell in (1, 2):
    plt.subplot(2, 2, cell)
plt.show()

1.2

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# `mpl` used to come from `from pylab import mpl`; pylab is a discouraged
# convenience shim, and importing matplotlib directly yields the same module.
# Use a CJK-capable font so the Chinese title/labels render, and keep the
# minus sign as a plain ASCII hyphen so it renders with that font.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Sample points 0.00, 0.01, ..., 0.99
data = np.arange(0, 1, 0.01)
ticks = [0, 0.2, 0.4, 0.6, 0.8, 1]

plt.title("曲线图")

# x axis: label, range, tick positions
plt.xlabel("x轴")
plt.xlim(0, 1)
plt.xticks(ticks)

# y axis: label, range, tick positions
plt.ylabel('y轴')
plt.ylim((0, 1))
plt.yticks(ticks)

# Two curves; the legend entries are matched to them in plotting order.
plt.plot(data, data ** 2)
plt.plot(data, data ** 4)
plt.legend(['y=x^2', 'y=x^4'])
plt.show()

1.3

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('HousePrice.csv')
# The original printed the literal string 'data'; print the frame itself.
print(data)
# plt.subplot(2,1,1)
# data['price'].hist(bins=20)
# plt.subplot(2,1,2)
# data['price'].hist(bins=50)
# data[0:8105]['price'].hist(bins=50,stacked=True,alpha=0.5)
# data[8105:16211]['price'].hist(bins=50,stacked=True)
# data['dist'].value_counts().plot(kind='bar')

# Horizontal bar chart of how often each 'dist' value occurs.
data['dist'].value_counts().plot(kind='barh')
plt.show()

2.0箱线图,饼图,散点图,折线图

import matplotlib.pyplot as plt
import pandas as pd

# ---- Box plot ----
# Load 'HousePrice.csv' into the DataFrame data1.
data1 = pd.read_csv('HousePrice.csv')
# # Box plot of 'price' grouped by 'dist'; notch=True draws notched boxes and
# # vert=True draws them vertically.
# data1.boxplot(column='price', by=['dist'], notch=True, vert=True)
# plt.show()

# ---- Pie chart ----
# data2 = [215, 130, 245, 210]
# vabels = ['A', 'B', 'C', 'D']  # wedge labels
# # New 5 x 5 inch figure
# plt.figure(figsize=(5, 5))
# # Pie chart with percentage labels; the third wedge is pulled out.
# plt.pie(data2, labels=vabels, autopct='%0.2f%%', explode=[0, 0, 0.1, 0])
# plt.show()

# # Pie chart of how often each 'dist' value occurs, labelled by the value,
# # percentages with two decimals, fifth wedge pulled out.
# plt.pie(x=data1['dist'].value_counts(), labels=data1['dist'].value_counts().index, autopct='%0.2f%%', explode=[0, 0, 0, 0, 0.1, 0])
# plt.show()

# ---- Scatter plot ----
# x axis: data1['AREA'], y axis: data1['price'].
# marker='s' draws each point as a square. (The original also had a bare
# `marker='s'` statement here, which only created an unused global variable;
# the keyword argument below is what actually sets the marker.)
plt.scatter(data1['AREA'], data1['price'], marker='s')
plt.show()
 
#折线图
# # 定义 x 轴数据,表示时间序列
# x1 = ['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
#       '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12']
# # 定义 y 轴数据,表示体重数据
# y1 = [96, 85, 84, 80, 75, 70, 70, 74, 78, 70, 74, 80]
# # 创建一个新的图形,并设置图形大小为10x7
# plt.figure(figsize=(10, 7))
# # 设置字体样式
# font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 14}
# plt.rc('font', **font1)
# # 绘制折线图
# plt.plot(x1, y1, label='weight changes', linewidth=3, color='r', marker='o',
#          markerfacecolor='blue', markersize=14)
# # 设置图形标题和轴标签
# plt.title("my weight", fontproperties=font1)  # 设置标题
# plt.xlabel('month', fontproperties=font1)  # 设置 x 轴标签
# plt.ylabel('weight', fontproperties=font1)  # 设置 y 轴标签
# # 显示图例
# plt.legend()
# # 显示绘制的折线图
# plt.show()

2.1练习

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

data1 = pd.read_csv('flights.csv')
# Exercise 1
# plt.plot(data1['year'].value_counts().index,data1.groupby('year')['passengers'].sum())
# plt.show()
# Exercise 2
# plt.bar(data1[data1['year']==1950]['month'],data1[data1['year']==1950]['passengers'])
# plt.show()

# data2=data1[['month','passengers']].groupby(['month']).sum()
# data3=pd.Series(data2['passengers'])
# plt.bar(data2.index,data3.values)
# plt.show()

# Exercise 3: scatter of (sepal length x width) against (petal length x width).
data4 = pd.read_csv('iris.csv')
sepal_product = data4['sepal_length'] * data4['sepal_width']
petal_product = data4['petal_length'] * data4['petal_width']
plt.scatter(sepal_product, petal_product)
plt.show()
 
#第四题
import matplotlib.pyplot as plt
import pandas as pd

# Read the iris measurements.
data = pd.read_csv('iris.csv')

# Pull the four measurement columns out into their own variables.
sepal_length, sepal_width, petal_length, petal_width = (
    data[column]
    for column in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
)

# Length-versus-width scatter for sepals and petals on the same axes.
plt.scatter(sepal_length, sepal_width, label='Sepal')
plt.scatter(petal_length, petal_width, label='Petal')

# Title, axis labels and legend.
plt.title('Sepal and Petal Size Relationship')
plt.xlabel('Length (cm)')
plt.ylabel('Width (cm)')
plt.legend()

plt.show()
 
import matplotlib.pyplot as plt
import pandas as pd

# Read the iris measurements.
data = pd.read_csv('iris.csv')

# One sub-frame per species.
setosa = data[data['species'] == 'setosa']
versicolor = data[data['species'] == 'versicolor']
virginica = data[data['species'] == 'virginica']

# (frame, legend label, marker) for each species, in plotting order.
_species_styles = (
    (setosa, 'Setosa', 'o'),
    (versicolor, 'Versicolor', 's'),
    (virginica, 'Virginica', '^'),
)

# A figure with two side-by-side panels: sepals on the left, petals on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

for panel, part in zip(axs, ('Sepal', 'Petal')):
    length_column = f'{part.lower()}_length'
    width_column = f'{part.lower()}_width'
    for frame, species_label, marker in _species_styles:
        panel.scatter(frame[length_column], frame[width_column],
                      label=species_label, marker=marker)
    panel.set_title(f'{part} Size Relationship')
    panel.set_xlabel(f'{part} Length (cm)')
    panel.set_ylabel(f'{part} Width (cm)')
    panel.legend()

# Tidy the spacing between the panels.
plt.tight_layout()

plt.show()

2.2数据清洗

import pandas as pd
import xlrd

data = pd.read_csv('missing_data.csv')

# Numeric columns
# print(data)
# print(data.fillna(data.mean()))    # fill with the column mean
# print(data.fillna(data.median()))  # fill with the column median
# print(data.fillna(data.mode()))    # fill with the mode frame (aligned row by row)
# print(data.fillna(data.mode().iloc[0]))  # fill with the first mode of each column

# Categorical columns
# data2=pd.read_excel('hr.xls')
# print(data2)
# data2['性别'].fillna(data2['性别'].mode()[0],inplace=True)
# print(data2)

# Exercise
data3 = pd.read_csv('hr_job.csv')
print(data3)
# data3['num'].fillna(data3['num'].mode()[0],inplace=True)
# print(data3)
# data3['gender'].fillna('未知',inplace=True)
# print(data3)
# data3['num'].fillna(data3['num'].mean(),inplace=True)
# print(data3)

# Fill every column's gaps with that column's first mode; data3 itself is
# not modified, only the filled copy is printed.
column_modes = data3.mode().iloc[0]
print(data3.fillna(column_modes))
 
#删除法
# dis1={'id':list(range(1,10)),'cpu':['i5','i7','i3',np.nan,np.nan,'i3','i9',np.nan,'i3']}
# a=pd.DataFrame(dis1)
# print(a)
# print(a.dropna())
 
#查看数据情况
# data = pd.read_excel('meal_order_detail.xls')
# # 打印数据的描述性统计信息,包括计数、均值、标准差、最小值、25%、50%、75%以及最大值等
# print(data.describe())
# # 打印数据中每个单元格是否为缺失值的布尔值(True表示缺失值,False表示非缺失值)
# print(data.isnull())
# # 打印每列(特征)中缺失值的数量
# print(data.isnull().sum())

2.3数据清洗-缺失值

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from scipy.interpolate import lagrange

# ---- Linear interpolation (sample points for the commented-out demo) ----
x = [1, 2, 3, 4, 5]
y = [2, 3, 5, 8, 10]
# f=interp1d(x,y)
# f1=lagrange(x,y)
# new_x=[1.5,2.5,3.5,4.5]
# new_y=f(new_x)
# new_y=f1(new_x)
# print(new_y)

# ---- Estimating the missing points x = 6, 7 ----
# x skips 6 and 7; y1 follows 2*x**2 and y2 follows 2*x + 1 at the given points.
x = np.array([1, 2, 3, 4, 5, 8, 9, 10])
y1 = np.array([2, 8, 18, 32, 50, 128, 162, 200])
y2 = np.array([3, 5, 7, 9, 11, 17, 19, 21])

# Piecewise-linear estimates.
data = interp1d(x, y1, kind='linear')
data2 = interp1d(x, y2, kind='linear')
print(data([6, 7]))
print(data2([6, 7]))

# Lagrange polynomial estimates.
ldata1 = lagrange(x, y1)
ldata2 = lagrange(x, y2)
print(ldata1([6, 7]))
print(ldata2([6, 7]))

# ---- Cubic spline interpolation ----
x = [1, 2, 3, 4, 5]
y = [2, 3, 5, 8, 10]
f3 = interp1d(x, y, kind='cubic')
new_x = [1.5, 2.5, 3.5, 4.5]
# The original fused this statement with a following `import pandas as pd` on
# one line, which is a syntax error; pandas is already imported above.
new_y = f3(new_x)
import pandas as pd
import numpy as np
import xlrd
import matplotlib.pyplot as plt

data = pd.read_csv('hr_job.csv')
print(data)

# Mean of 'num' before any filling; reused below as the outlier threshold.
y = data['num'].mean()
print(y)

# Fill the gaps in 'num' with the mean. Assign the result back instead of
# calling fillna(inplace=True) on the column selection: the chained in-place
# form may act on a temporary and is not guaranteed to update `data`.
data['num'] = data['num'].fillna(y)
print(data)

# The original repeated the fillna here and discarded its result (a no-op);
# only the print is kept so the output sequence is unchanged.
print(data)

# 2: fill, then convert 'num' to integers.
data['num'] = data['num'].fillna(data['num'].mean()).astype(int)
print(data)

# Cap values above the threshold at the threshold. The threshold is a float,
# so make the column float first rather than writing a float into an integer
# column through .loc.
data['num'] = data['num'].astype(float)
data.loc[data['num'] > y, 'num'] = y
print(data)

# Any remaining missing cell in the frame becomes the placeholder string.
data.fillna('未知', inplace=True)
print(data)
 
# a_num=(data['num']<=y)
# print(data.loc[a_num])
 
 
# 读取数据
# data = pd.read_csv('hr_job.csv')
# print(data)
#
# # 选择只包含数值型数据的列
# numeric_data = data.select_dtypes(include='number')
#
# # 计算每个特征的均值
# mean_values = numeric_data.mean()
#
# # 标识数值型异常数据的布尔索引
# outlier_index = (numeric_data > 3 * mean_values) | (numeric_data < 0.3 * mean_values)
#
# # 逐列替换异常值为对应特征的均值
# for column in numeric_data.columns:
#     numeric_data.loc[outlier_index[column], column] = mean_values[column]
#
# # 将处理后的数值型数据重新放回原始数据中
# data[numeric_data.columns] = numeric_data
#
# print("替换异常值后的数据:")
# print(data)
 
 
 
 
 
 
 
# data['num'].fillna(data['num'].mean(),inplace=True)
# print(data)
# data['gender'].fillna(data['gender'].mode()[0],inplace=True)
# print(data)
# p=plt.boxplot(data['num'])
# plt.show()
# out=p['fliers'][0].get_ydata()
# print(len(out))
# print(map(out))
# print(min(out))
 
 
 
# np.random.seed(10)
# data=np.random.randint(0,1000,size=(10,6))
# df=pd.DataFrame(data,columns=['A','B','C','D','E','F'])
# print(df)
# a_mean=df['D'].mean()*1.5
# print(a_mean)
# #判断异常值k
# print(df['D']>a_mean)
# #过滤异常值
# k=(df["D"]<=a_mean)
# print(df.loc[k])
 
# data1=pd.read_excel('stu.xls')
# print(data1)
 
# x=data1['age'].mean()
# print(x)
# k=(data1['age']<=x)
# print(data1.loc[k])
 
print(new_y)
 

2.4数据清洗-去重

import pandas as pd

# The file is decoded as GBK.
data = pd.read_csv('detail.csv', encoding='gbk')
print(data)

# Distinct dish names, keeping the first occurrence of each.
unique_dishes = data['dishes_name'].drop_duplicates()
print(unique_dishes)

2.5变换数据

import pandas as pd
import numpy as np
import xlrd
 
# data=pd.read_excel('data3.xls')
# print(data)
# print(data.dtypes)
# 1
# data['id']=data['id'].astype(str)
# print(data.dtypes)
#
# data['custom_amt']=data['custom_amt'].astype(str)
# print(data.dtypes)
 
# df=pd.DataFrame([['green','M',10.1],['red','XL',15.6],['blue','L',20]])
# print(df)
# df.columns=['color','size','price']
# print(df)
# ma={'XL':3,'L':2,'M':1}
# df['size']=df['size'].map(ma)
# print(df)
#
# data1=pd.DataFrame({'color':['red','green','blue'],'class':['A','B','C']})
# print(data1)
# print(pd.get_dummies(data1,dtype=int))
 
# data=pd.read_csv('detail.csv',encoding='gbk')
# print(data)
# print(data['dishes_name'].drop_duplicates())
# data1={'math':[80,90,70,60,85],'physcice':[85,95,75,70,90]}
# df=pd.DataFrame(data1)
# print(df)
# crr1=df.corr()
# print(crr1)
 
 
# 200 seeded random integers in [0, 100).
np.random.seed(10)
data = np.random.randint(0, 100, 200)
# print(data)

# Quantile-based binning into six bins.
d1 = pd.qcut(data, q=6)
print(d1)

# Number of values that fell into each bin.
bin_sizes = d1.value_counts()
print(bin_sizes)

2.6数据合并

import pandas as pd
import numpy as np
#encoding=utf-8
# data=pd.read_csv('detail.csv',encoding='gbk')
# data.to_csv("data.csv")
# print(data)
# print(data['dishes_name'])
# print(pd.get_dummies(data['dishes_name'],dtype=int))
 
# df1=pd.DataFrame({'name':['mary','lili','duck'],'age':[25,26,27]})
# df2=pd.DataFrame({'name':['rose'],'age':[30]})
# print(df1)
# print(df2)
# df3=pd.concat([df1,df2],axis=0,ignore_index=True)
# print(df3)
# df4=pd.concat([df1,df2],axis=1,ignore_index=True)
# print(df4)
# print(df4.to_csv('resl.xls'))
 
 
# a1=pd.read_csv('meal_order_info.csv',sep=',',encoding='gbk')
# a2=pd.read_excel('meal_order_detail.xls')
# # print(a1)
# # print(a2)
# print(a1.dtypes)
# # a1['info_id']=a1['info_id'].astype('str')
# # print(a1.dtypes)
# a3=pd.merge(a2,a1,left_on='order_id',right_on='info_id')
# print(a1.shape)
# print(a2.shape)
# print(a3.shape)
# print(a3)
 
data=pd.read_csv('detail.csv',encoding='gbk')
# print(data.head())
# print(data[['counts','amounts']])
# def MinMaxScale(data):
#     return(data-data.min())/(data.max()-data.min())
# a=MinMaxScale(data['counts'])
# b=MinMaxScale(data['amounts'])
# df=pd.concat([a,b],axis=1)
# print(df)
 
def DecimaLScale(data):
    """Decimal-scaling normalisation: divide by a power of ten so |value| < 1.

    The divisor is 10**j, where j is the smallest integer for which the
    largest absolute value of ``data`` drops strictly below 1.

    Args:
        data: a numeric pandas Series.

    Returns:
        A float Series of the same length and index. If the largest absolute
        value is zero or not finite (empty or all-NaN input), there is nothing
        to scale and the values are returned unchanged as floats.
    """
    peak = data.abs().max()
    # log10(0) is -inf and a NaN/inf peak has no usable magnitude; the
    # original turned an all-zero column into NaN through 0/0 in this case.
    if not np.isfinite(peak) or peak == 0:
        return data.astype(float)
    # floor(log10) + 1 rather than ceil(log10): for an exact power of ten
    # (e.g. peak == 10) ceil gives 1, leaving the peak at exactly 1.0 instead
    # of strictly below 1.
    exponent = int(np.floor(np.log10(peak))) + 1
    # Guard against log10 rounding just below an integer.
    if peak / 10.0 ** exponent >= 1:
        exponent += 1
    return data / 10.0 ** exponent
# Scale both numeric columns.
a = DecimaLScale(data['counts'])
b = DecimaLScale(data['amounts'])
# Show the raw and the scaled columns side by side. The scaled Series keep the
# names 'counts'/'amounts', so concatenating them as-is yields duplicate
# column labels; rename them for the display so each column is identifiable.
print(pd.concat(
    [data[['counts', 'amounts']],
     a.rename('counts_scaled'),
     b.rename('amounts_scaled')],
    axis=1,
))

2.7分类汇总

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('HousePrice.csv')
# print(data.head())
# print(data.describe())
# print(data.price.mean())
# print(data.price.median())
# data.price.hist(bins=100)
# plt.show()

# Spread of 'price'
print(data.price.max() - data.price.min())  # range
print(data.price.std())                     # standard deviation
print(data.price.var())                     # variance

# Shape of the 'price' distribution
print(data.price.skew())                    # skewness
print(data.price.kurtosis())                # kurtosis

# Covariance between 'AREA' and 'price'
print(data.AREA.cov(data.price))

# 'roomnum': relative frequency of each value, its mode, and raw counts.
print(data.roomnum.value_counts() / data.roomnum.count())
# The original took .mode() of value_counts(), i.e. the most common *count*,
# not the most common room number; take the mode of the column itself.
print(data.roomnum.mode())
print(data.roomnum.value_counts())
import pandas as pd
import numpy as np

# One order per calendar day from 2019-01-01 to 2022-12-31.
time_range = pd.date_range(start='1/1/2019', end='12/31/2022')
# print(time_range)
# print(len(time_range))

# Random fruit for every day (drawn with replacement; no seed is set, so each
# run produces different data).
fruits = ['香蕉', '苹果', '葡萄', '橘子', '哈密瓜', '芭乐', '梨子']
fruits_list = np.random.choice(fruits, size=len(time_range), replace=True)
# print(fruits_list)
# print(len(fruits_list))

# Random customer for every day.
names = ['张三', '李四', '王五', '赵六', '田七', '高八', '刘九']
names_list = np.random.choice(names, size=len(time_range), replace=True)
# print(names)
# print(len(names_list))

# Random weight in [50, 100) for every day.
kilo = np.random.choice(np.arange(50, 100), size=len(time_range), replace=True)

order = pd.DataFrame({'time': time_range, 'fruit': fruits_list,
                      'name': names_list, 'kilogram': kilo})
print(order)

# Price and region per fruit.
information = pd.DataFrame({'fruit': fruits,
                            'price': [4, 5.6, 8.9, 2.5, 6, 7, 8],
                            'region': ['华南', '东北', '西南', '华中', '东北', '西南', '华中']})
print(information)

# Attach price/region to every order. The join key is named explicitly (the
# original relied on the shared column being picked up implicitly), and the
# result is sorted by time once - the original sorted twice in a row.
df = (pd.merge(order, information, on='fruit', how='outer')
      .sort_values('time')
      .reset_index(drop=True))
print(df)

# Calendar fields derived from the timestamp ('%y%m' is two-digit year + month).
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['year_month'] = df['time'].dt.strftime('%y%m')

# Total weight per month.
df1 = df.groupby(['year_month'])['kilogram'].sum().reset_index()
print(df1)

# Money amount of each order.
df['amount'] = df['kilogram'] * df['price']
print(df.head())

# Total weight per year and region.
df2 = df.groupby(['year', 'region'])['kilogram'].sum().reset_index()
print(df2)

# Total weight and total amount per year.
df3 = df.groupby('year').agg({'kilogram': 'sum', 'amount': 'sum'}).reset_index()
print(df3)

# Average price per kilogram per year.
df3['mean_amount'] = df3['amount'] / df3['kilogram']
print(df3)

2.8数据透视表

import pandas as pd

# Four observations: a date string, a category and a value each.
data = {
    'Data': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
    'Category': ['A', 'B', 'A', 'B'],
    'Value': [10, 20, 30, 40],
}

df = pd.DataFrame(data)
print(df)

# Dates down the rows, categories across the columns, summed values in the cells.
pivit = df.pivot_table(
    values='Value',
    index='Data',
    columns='Category',
    aggfunc='sum',
)
print(pivit)
 
 
 
import pandas as pd

# Six people described by gender, age group and income.
data = {
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'Age': ['Youth', 'Youth', 'Middle Agen', 'Middle Agen', 'Senior', 'Senior'],
    'Income': [50000, 50000, 60000, 60000, 70000, 70000],
}

df = pd.DataFrame(data)
print(df)

# Frequency table: how many people fall into each (gender, age group) pair.
gender_column = df['Gender']
age_column = df["Age"]
cross = pd.crosstab(gender_column, age_column)
print(cross)
 
 
 
import pandas as pd

mudata = pd.read_csv('musicdata.csv')
print(mudata)

# Pivot on 'format' with pivot_table's default aggregation, restricted to the
# two numeric columns of interest.
selected_columns = mudata[['format', 'number_of_records', 'value_actual']]
mupovit = pd.pivot_table(selected_columns, index='format')
print("以format作为分组键创建的透视表为:\n ", mupovit.head())

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值