# numpy基础 (NumPy basics)
import numpy as np
# Build a one-dimensional array from a Python list and inspect it.
arr = np.array([1, 2, 4, 5])
print(type(arr))
print(arr)
# np.nan is NumPy's floating-point "not a number" value.
print(np.nan)
# Called with only a condition, np.where returns a tuple of index arrays,
# one per dimension; element 0 holds the positions along the first axis.
_above_two = np.where(arr > 2)
print(_above_two)
print(arr[_above_two[0]])
# pandas读取数据 (Reading data with pandas)
import pandas as pd
def create_dict_df():
    """Build a small job/salary DataFrame from a dict and export it.

    Prints the DataFrame's type and contents, then writes it to
    ``data.xlsx`` and ``data.csv`` in the current working directory.
    """
    data = {
        'jobname': ['大数据开发工程师', '数据分析师', 'Python爬虫分析师', '大数据运维工程师'],
        'salary': [20000, 18000, 16000, 17000],
        'city': ['北京', '北京', '上海', '郑州'],
    }
    # Each dict key becomes a column; each list supplies that column's rows.
    df = pd.DataFrame(data)
    print(type(df))
    print(df)
    df.to_excel('data.xlsx')
    df.to_csv('data.csv')
def df_view():
    """Read ``salaryinfo.xlsx`` and print its jobname, salary and city columns."""
    path = 'salaryinfo.xlsx'
    df = pd.read_excel(path)
    # Indexing with a list of labels keeps only those columns, in that order.
    df = df[['jobname', 'salary', 'city']]
    print(df)
def df_view2():
    """Read every sheet of ``salaryinfo_sheet.xlsx`` and stack them into one frame.

    Prints the per-sheet mapping returned by pandas, then the concatenated
    DataFrame and its type.
    """
    path = 'salaryinfo_sheet.xlsx'
    # sheet_name=None loads all sheets and returns a dict of DataFrames.
    # Alternatives: sheet_name=[0, 1, 2] selects sheets by position and
    # sheet_name=['北京', '上海'] selects them by name.
    dict1 = pd.read_excel(path, sheet_name=None)
    print('***')
    print(dict1)
    # ignore_index=True discards the per-sheet row labels and renumbers
    # the combined rows from 0.
    df = pd.concat(dict1, ignore_index=True)
    print(df)
    print(type(df))
def df_view3():
    """Read selected columns of ``salaryinfo_sheet.xlsx``, formatting salary on load.

    Only the jobname, salary and city columns are read, and every salary
    cell is passed through ``salary_format`` while the file is parsed.
    Prints the resulting DataFrame and its type.
    """
    path = 'salaryinfo_sheet.xlsx'
    df = pd.read_excel(
        path,
        usecols=['jobname', 'salary', 'city'],
        # converters maps a column name to a function applied to each cell.
        converters={'salary': salary_format},
    )
    print(df)
    print(type(df))
def salary_format(salary):
    """Return *salary* rendered as text with the yuan suffix ``元`` appended."""
    return f'{salary}元'
if __name__ == '__main__':
    # Run one demo when executed as a script; the commented-out calls are
    # the other demos that can be enabled instead.
    # create_dict_df()
    # df_view3()
    df_view()
# DataFrame操作 (DataFrame operations)
import pandas as pd
def df_view():
    """Read ``salaryinfo.xlsx`` and print the jobs with salary in [15000, 20000).

    Prints the full DataFrame first, then the type and contents of the
    filtered jobname/salary subset.
    """
    path = 'salaryinfo.xlsx'
    df = pd.read_excel(path)
    print(df)
    # Other selections: df[['jobname', 'salary', 'city']] returns a DataFrame
    # of those columns, while df['jobname'] returns a single Series.
    in_band = (df['salary'] >= 15000) & (df['salary'] < 20000)
    # One .loc call selects rows and columns together instead of chaining
    # two separate indexing operations.
    high_data = df.loc[in_band, ['jobname', 'salary']]
    print(type(high_data))
    print(high_data)
def df_view1():
    """Read ``salaryinfo_sheet.xlsx`` and print a tour of DataFrame inspection calls.

    Covers shape, info, dtypes, null mask, unique values, raw values,
    column labels, head/tail, boolean filtering, row lookup and row
    iteration. With no ``sheet_name`` given, only the first sheet is read.
    """
    path = 'salaryinfo_sheet.xlsx'
    df = pd.read_excel(path)
    # Dimensions of the two-dimensional structure as (rows, columns)
    print(df.shape)
    # Summary of columns, non-null counts and column dtypes. info() prints
    # the report itself and returns None, so wrapping it in print() would
    # emit a stray "None" line.
    df.info()
    # Data type of each column
    print(df.dtypes)
    # Boolean mask marking missing values
    print(df.isnull())
    # Distinct values of the 'exp' column
    print(df['exp'].unique())
    # Underlying values as an array
    print(df.values)
    # Column labels
    print(df.columns)
    # First 10 rows
    print(df.head(10))
    # Last 5 rows (tail's default)
    print(df.tail())
    # Row filtering: plain boolean indexing and .loc give the same rows
    print(df[df['edu'] == '本科以上'])
    print(df.loc[df['edu'] == '本科以上'])
    # Fetch the row labelled 0
    print(df.loc[0])
    # Iterate over the rows; the index label is not needed here
    for _, row in df.iterrows():
        print(row)
        print('岗位名称:', row['jobname'])
if __name__ == '__main__':
    # Run the inspection demo when executed as a script; the commented-out
    # call is another demo that can be enabled instead.
    # create_dict_df()
    df_view1()
# 综合案例 (Combined case study)
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Map
def df_deal():
    """Clean the salary workbook, print summaries and chart mean salary by city.

    Reads every sheet of ``salaryinfo_sheet.xlsx`` into one DataFrame, then:
    fills missing ``edu`` values, trims and lower-cases text columns,
    renames ``jobname``, drops ``company``, normalises a few labels, sorts
    by salary and computes the mean salary per city. The result is shown
    as a matplotlib pie chart and rendered to ``test_map_1.html`` via
    ``map_visualmap``.
    """
    path = 'salaryinfo_sheet.xlsx'
    sheets = pd.read_excel(path, sheet_name=None)
    df = pd.concat(sheets, ignore_index=True)

    # Missing values: label them rather than dropping the rows
    # (df.dropna() would be the row-dropping alternative).
    df['edu'] = df['edu'].fillna('未知')
    # The .str accessor applies the string method element-wise and passes
    # missing values through, so a blank job name no longer raises.
    df['edu'] = df['edu'].str.strip()
    df['jobname'] = df['jobname'].str.lower()
    df = df.rename(columns={'jobname': '岗位名称'})
    # df.drop_duplicates() would remove repeated rows if that were needed.
    df = df.drop(columns='company')
    print(df)

    # Normalise labels; '郑州' is mapped to its province name '河南' because
    # the values are later passed to map_visualmap as provinces.
    df['edu'] = df['edu'].replace('本科以上', '本科')
    df['city'] = df['city'].replace('郑州', '河南')
    print(df)

    df = df.sort_values(by=['salary'], ascending=False)
    print(df)

    df_stat = df.groupby('city')['salary'].mean()
    print(df_stat)

    # Select the SimHei font for the chart text, which contains Chinese
    # characters.
    plt.rcParams['font.sans-serif'] = ['simhei']
    plt.title('不同城市大数据平均薪资')
    plt.pie(df_stat.values, labels=df_stat.index)
    # plt.bar(df_stat.index, df_stat.values) or plt.scatter(...) would draw
    # the same data as a bar or scatter chart instead.
    plt.show()

    city_ = map_visualmap(df_stat.index, df_stat.values)
    city_.render(path="test_map_1.html")
def map_visualmap(provinces, value) -> Map:
    """Build a pyecharts China map shaded by one value per province.

    Args:
        provinces: Iterable of region names understood by the "china" map.
        value: Iterable of numeric values, paired positionally with
            ``provinces``.

    Returns:
        The configured ``Map`` chart; call ``.render(...)`` to write it out.
    """
    provinces = list(provinces)
    values = list(value)
    # The visual-map upper bound must cover the data, otherwise every region
    # above it gets the same top colour. Keep the previous 2000 as a floor so
    # small-valued inputs render exactly as before. NaN is skipped
    # (NaN != NaN) because it would make max() order-dependent.
    finite = [float(v) for v in values if v == v]
    upper = max([2000, *finite])
    c = (
        Map()
        .add("", [list(z) for z in zip(provinces, values)], "china")
        .set_global_opts(
            title_opts=opts.TitleOpts(title="不同省份大数据平均薪资分级设色图"),
            visualmap_opts=opts.VisualMapOpts(max_=upper),
        )
    )
    return c
if __name__ == '__main__':
    # Run the combined case study when executed as a script; the
    # commented-out call is another demo that can be enabled instead.
    # create_dict_df()
    df_deal()