# t001临时笔记_pandas_后面整理 (temporary pandas notes, to be organized later; commented out — a bare name here raises NameError on import)
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
warnings.filterwarnings('ignore')
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font that can render CJK glyphs
plt.rcParams["axes.unicode_minus"] = False  # render '-' correctly when a CJK font is active
# class duiqi():
# pd.set_option('max_rows', None) # 显示最多行数
# pd.set_option('max_columns', None) # 显示最多列数
# pd.set_option('expand_frame_repr', False) # 当列太多时显示不清楚
# pd.set_option('display.unicode.east_asian_width', True) #设置输出右对齐
def read_csv():
    """Demo: load a CSV with pandas and print its basic metadata."""
    fpath = r"C:\Users\30767\Desktop\数据.csv"
    data01 = pd.read_csv(fpath,
                         # sep=',',  # field separator
                         # header=None,  # file has no header row
                         # names=[1, 2, 3, 4, 5, 6, 7, 8],  # supply column names
                         )
    print(data01.head(),  # first rows; accepts a row-count argument
          data01.shape,  # (rows, columns)
          data01.columns,  # column labels
          data01.index,  # row index
          data01.dtypes,  # per-column dtypes
          sep='\n',
          )
def read_mysql(table):
    """Demo: read a whole MySQL table into a DataFrame and print it.

    NOTE(review): the table name is interpolated directly into the SQL
    string — only call this with trusted values.
    NOTE(review): pd.read_sql with a raw DBAPI connection works, but newer
    pandas warns and recommends an SQLAlchemy engine — confirm target version.
    """
    conn = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='0000',
        database='db1',
        charset='utf8',
    )
    mysql_page = pd.read_sql(f'select * from {table}', con=conn)
    conn.close()
    print(mysql_page)
def write_to_mysql():
    """Demo: load a local CSV and write it into MySQL table 'db102'."""
    from sqlalchemy import create_engine

    engine = create_engine('mysql+pymysql://root:0000@localhost:3306/db1')
    frame = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    # chunksize=2: insert two rows per batch; index=False drops the row index
    frame.to_sql('db102', engine, chunksize=2, index=False)
    print('success')
def get_series01():
    """Demo: build Series from a list and from a dict, then inspect them."""
    # A Series is a 1-D labelled array — one row or one column with an index.
    labelled = pd.Series([1, 'q', [1, 2, 3], 20030102, 7],
                         index=['a', 'b', 'c', 'd', 'e'],  # explicit labels (default is 0, 1, 2, ...)
                         )
    print(labelled, labelled.values,
          labelled.index,
          sep='\n',
          )
    from_dict = pd.Series({'s': 1, 'w': 2, 'r': 3})  # dict keys become the index
    print(from_dict,
          from_dict['w'],  # single label -> scalar value
          from_dict[['r', 's']],  # list of labels -> sub-Series
          sep='\n'
          )
def get_dataframe():
    """Demo: build a DataFrame from a dict of columns and query it."""
    records = {
        'name': ['mac', 'bob', 'alice', 'jack'],
        'age': [10, 24, 17, 40],
        'where': ['china', 'america', 'jap', 'indea'],
    }
    frame = pd.DataFrame(records)
    print(
        frame,
        frame.dtypes,
        frame.index,
        frame.columns,
        frame['name'],  # one column -> Series (row labels kept)
        type(frame['name']),
        frame[['age', 'where']],  # several columns -> DataFrame
        type(frame[['age', 'where']]),
        frame.loc[2],  # one row -> Series (column labels become the index)
        frame.loc[1:3],  # label slice of rows, end-inclusive
        sep='\n',
    )
def df_chaxun():
    """Demo: label-based querying with df.loc — scalars, lists, slices, masks."""
    # IndexSlice examples for multi-level selection (kept for reference):
    # dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
    # idx = pd.IndexSlice
    # dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]]
    # dfmi.loc["A1", (slice(None), "foo")]
    # mask = dfmi[("a", "foo")] > 200
    # dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]]
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    df.set_index('ymd', inplace=True)  # replace the default index; inplace avoids a new object
    print(
        df.index,
        df.dtypes,
        sep='\n'
    )
    # Clean and convert: strip the '℃' suffix, then cast the text column to int32.
    df.loc[:, '体温'] = df['体温'].str.replace('℃', '').astype('int32')
    print(
        df.dtypes,
        '行列指定查询',  # (header string) row/column lookup by label
        df.loc['2022/1/3', '体温'],  # one row label, one column label -> scalar
        df.loc[['2022/1/3', '2022/1/2'], '体温'],  # list of row labels -> Series
        df.loc['2022/1/3', ['体温', 'id']],  # one row, several columns
        df.loc[['2022/1/3', '2022/1/2'], ['体温', 'id']],  # several rows and columns
        '行和列的切片查询',  # (header string) label slices on both axes, end-inclusive
        df.loc['2022/1/2':'2022/1/4', 'id':'体温'],
        '高级条件查询',  # (header string) boolean-mask query
        df.loc[(df['体温'] < 38) & (df['score'] > 13), :],  # each condition yields a True/False mask
        sep='\n'
    )
def df_tianlie():
    """Demo: four ways to add columns to a DataFrame."""
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    # 1. direct assignment from an expression on existing columns
    df.loc[:, 'num'] = df['id'] + 1

    # 2. via apply on each row — note the axis argument
    def dafen(x):
        if x['score'] > 10:
            return '优秀'
        if x['score'] < 5:
            return "垃圾"
        return '正常'
    df.loc[:, '打分'] = df.apply(dafen, axis=1)  # axis=1: the function receives one row at a time
    # 3. df.assign adds several columns at once and returns a NEW DataFrame
    df2 = df.assign(
        f1=lambda x: x['score'] * 10,
        f2=lambda x: x['score'] * 0.1 + 100
    )
    # 4. conditional group assignment
    df2['差'] = ''  # broadcast: fills every row
    df2.loc[df2['f2'] - df2['f1'] > 30, '差'] = '差较大'
    df2.loc[df2['f2'] - df2['f1'] <= 30, '差'] = '差较小'
    print(
        df.columns,
        df2,
        df['打分'].value_counts(),  # frequency of each grade value
        sep='\n'
    )
def data_des():
    """Demo: summary statistics of a DataFrame and of single columns.

    Fix: cov()/corr() are restricted to numeric columns explicitly —
    since pandas 2.0 they raise on non-numeric data instead of silently
    dropping it, and this frame contains string columns (e.g. '国家').
    """
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    print(
        df,
        df.describe(),  # count/mean/std/min/quartiles/max of numeric columns
        df['score'].max(),  # single-column aggregates
        df['score'].mean(),
        df['score'].min(),
        df['国家'].unique(),  # distinct values
        df['国家'].value_counts(),  # value frequencies, descending
        df.cov(numeric_only=True),  # covariance matrix (co-movement; 0 = unrelated)
        df.corr(numeric_only=True),  # correlation-coefficient matrix
        df['score'].corr(df['身高']),  # pairwise correlation of two columns
        sep='\n'
    )
def i_df_sort():
    """Demo: sorting a Series and sorting a DataFrame by one or more columns."""
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    print(
        df.head(),
        df['身高'].sort_values(ascending=False),  # ascending=True is the default; inplace also available
        df.sort_values(by='身高', ascending=False),
        '*' * 50,
        df.sort_values(by=['身高', 'id'], ascending=[1, False]),  # 1 is treated as True here
        '*' * 50,
        df.sort_values(by=['身高', 'id'], ascending=[True, False]),  # per-column sort direction
        sep='\n'
    )
def i_df_str():
    '''Demo: the .str accessor.

    Only string/object Series expose .str (numeric dtypes raise, and a
    DataFrame has no .str); its methods differ from Python's built-in str.
    '''
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")
    print(
        df,
        df['体温'].str,  # the accessor object itself
        df['体温'].str.replace('℃', ''),  # substring replacement
        df['体温'].str.isnumeric(),  # True where the text is purely numeric
        df['体温'].str.len(),
        df['体温'].str.startswith('4'),  # prefix test
        # .str methods return a Series, so .str can be chained again
        df['ymd'].str.replace('/', '').str.slice(4, 6),
        df['ymd'].str.replace('/', '').str[4:6],  # slice syntax, same as .slice(4, 6)
        sep='\n',
    )
def i_df_re():
    """Demo: literal vs regex str.replace on a formatted date column.

    Fix: the character-class replacement now passes regex=True explicitly.
    pandas >= 2.0 defaults Series.str.replace to regex=False, which would
    leave '[年月日]' matched literally and strip nothing.
    """
    df = pd.read_csv(r"C:\Users\30767\Desktop\数据01.csv")

    def f(x):
        y, m, d = x['ymd'].split('/')
        return f'{y}年{m}月{d}日'

    df['日期'] = df.apply(f, axis=1, )
    print(
        df,
        # three literal replacements, chained (each returns a new Series)
        df['日期'].str.replace('年', '').str.replace(
            '月', '').str.replace('日', ''),
        # one regex replacement: every character in the class is removed
        df['日期'].str.replace('[年月日]', '', regex=True),
        sep='\n'
    )
def i_df_axis():
    """Demo: what axis=0 / axis=1 mean for apply, drop and mean."""
    raw = np.arange(12).reshape(3, 4)  # build a small matrix
    df = pd.DataFrame(raw, columns=['a', 'b', 'c', 'e'])
    df1 = pd.DataFrame(raw, columns=list('qwer'))
    print(
        raw,
        df,
        df1,
        '*' * 40,
        sep='\n'
    )

    def row_total(row):
        return row['a'] + row['b'] + row['c'] + row['e']

    df['mean'] = df.apply(row_total, axis=1)  # axis=1: sweep left-to-right across each row
    print(
        df,
        df.drop('b', axis=1),  # axis=1 removes a column
        df.drop(1, axis=0),    # axis=0 removes a row
        df.mean(axis=0),       # collapse rows: one value per column
        df.mean(axis=1),       # collapse columns: one value per row
        sep='\n'
    )
def i_df_index():
    """Demo: why an index matters for lookups, plus automatic index alignment."""
    df = pd.read_csv(
        r"E:\study\programme\python\py_1_test\file\jm_data.txt", encoding='utf-8')
    df.set_index('序号', inplace=True, drop=False)  # index by this column but keep it as data too
    print(
        df,
        # df.columns,
        # df.index,
        # Index lookups are fast: unique index ~ hash lookup, sorted index ~
        # binary search, no index ~ full scan (cost grows with size).
        df.loc[10],
        df.loc[df['序号'] == 10],  # equivalent boolean-mask lookup (slower)
        sep='\n'
    )
    df1 = pd.Series([1, 2, 3], index=list("abc"))
    df2 = pd.Series([2, 3, 44], index=list("bcd"))
    print(
        '*' * 60,
        df1,
        df2,
        df1 + df2,  # labels align automatically; labels missing on one side give NaN
        sep='\n'
    )
    # Richer index types exist: MultiIndex, DatetimeIndex, CategoricalIndex.
def i_df_merge():
    """Placeholder: DataFrame merge examples (to be written)."""
def i_df_connect():
    """Placeholder: DataFrame concatenation examples (to be written)."""
def i_df_pt():
    """Demo: pivot_table grouped on the '序号' column (default aggregation)."""
    frame = pd.read_csv(
        r"E:\study\programme\python\py_1_test\file\jm_data.txt", encoding='utf-8')
    pivoted = pd.pivot_table(frame, index='序号')
    print(pivoted)
def i_df_group():
    """Demo: groupby — iterating groups, selecting groups/columns, aggregating, plotting."""
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    g = df.groupby('类型')  # group first, aggregate later
    for group_name, group in g:  # each item is a (key, sub-DataFrame) pair
        print(group_name, group, sep='\n')
    g1 = df.groupby(['周几', '类型'])  # multi-key grouping: keys are tuples
    for x, y in g1:
        print(x)
        print(y)
        print()
    for x, y in g1['单价']:  # select one column -> iterate (key, Series)
        print(x)
        print(y)
        print(type(y))
        print()
    for x, y in g1[['单价', '金额']]:  # select two columns -> iterate (key, DataFrame)
        print(x)
        print(y)
        print(type(y))
        print()
    print(
        '-' * 50,
        g.get_group('蔬菜'),  # fetch one group by key
        g1.get_group((4, '蔬菜')),  # fetch one group by key tuple
        '-' * 50,
        sep='\n'
    )
    print(
        df,
        df.groupby('类型').sum(),  # keys become the index; non-numeric columns are dropped
        df.groupby(['周几', '类型']).mean(),  # multi-key -> MultiIndex on the result
        # as_index=False keeps the keys as ordinary columns instead
        df.groupby(['周几', '类型'], as_index=False).mean(),
        # several aggregate functions applied to every remaining column
        df.groupby('类型').agg([np.sum, np.mean, np.std]),
        df.groupby('类型')['金额'].agg([np.sum, np.mean, np.std]),  # select the column first: cheaper
        df.groupby('类型')['金额'].agg([np.sum, np.mean, np.std]),  # same result, selection after grouping
        df.groupby('类型').agg({'周几': np.mean, '金额': np.mean}),  # different function per column
        '-' * 50,
        g['金额'].sum(),
        g['金额'].sum().plot(),
        g.agg({'数量': np.sum, '单价': np.mean, '金额': np.mean}).plot(),
        plt.show(),
        sep='\n',
    )
def i_df_multindex_1():
    """Demo: querying a Series that carries a two-level (MultiIndex) index.

    Fix: the ``ser1 = ...agg(...)`` statement used to end with a stray
    trailing comma, which wrapped the aggregation result in a 1-tuple and
    made the later ``print(ser1)`` show a tuple instead of a DataFrame.
    Also removed an unused local (``df1``).
    """
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    ser = df.groupby(['类型', '周几'])['数量'].mean()
    ser1 = df.groupby(['类型', '周几']).agg(
        {'单价': np.mean, '金额': np.sum, '数量': np.max})
    print(
        df.shape,
        df.head(),
        df['类型'].unique(),
        df.index,
        df.groupby('类型')['数量'].mean(),
        df.groupby(['类型', '周几'])['数量'].mean(),
        '-' * 50,
        ser.index,  # a MultiIndex
        ser.unstack(),  # inner level -> columns; two-level index becomes one (un-pivot)
        ser.reset_index(),  # both levels become columns; new default 0..n index
        ser.loc['水果'],  # select by the first level only
        ser.loc[('水果', 1)],  # select by a (level-1, level-2) tuple
        ser.loc[(['水果', '蔬菜'], [1, 2])],  # lists per level select a cross-product
        ser1,
        ser.loc[:, 1],  # select by the second level only, first level ignored
        '-' * 80,
        sep='\n'
    )
def i_df_multindex_2():
    """Demo: row selection on a DataFrame with a two-level row index."""
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    df1 = df.set_index(['周几', '类型'])
    df1.sort_index(inplace=True)  # a sorted index makes slicing valid and fast
    print(
        '-' * 50,
        df1,
        df1.index,
        df1.loc[1],  # first level only
        df1.loc[(1, '水果')],  # (level-1, level-2) tuple
        df1.loc[(1, '水果'), :],  # same as above, columns spelled out
        df1.loc[(1, '水果'), '产品'],  # tuple row key plus one column
        '*' * 80,
        df1.loc[[2, 3], :],  # several first-level keys
        df1.loc[[2, 3]],  # same, columns implicit
        '()' * 90,
        df1.loc[([2, 3], '蔬菜'), :],  # list on level 1, scalar on level 2
        df1.loc[([2, 3], '蔬菜'), '单价'],  # plus a single column
        df1.loc[([2, 3], '蔬菜'), ['数量', '金额']],  # plus a column list
        df1.loc[([4], ['蔬菜', '水果']), ['数量', '金额']],  # lists on both levels
        df1.loc[([4, 5], ['蔬菜', '水果']), ['数量', '金额']],
        '#' * 90,
        df1.loc[(slice(None), ['蔬菜', '水果']), :],  # slice(None): any first-level value
        sep='\n',
    )
def df_fix():
    """Demo: map / apply / applymap for element-wise and row-wise transforms."""
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    dic = {
        '水果': 'shuiguo',
        '肉': 'rou',
        '蔬菜': 'shucai'
    }
    df['拼音1'] = df['类型'].map(dic)  # map: Series only; accepts a dict or a function
    df['pinin2'] = df['类型'].map(lambda x: dic[x].upper())
    df['pinin3'] = df['类型'].apply(
        lambda x: dic[x].upper())  # apply: function only; works on Series and DataFrame
    df['pinin4'] = df.apply(lambda x: dic[x['类型']].upper(),
                            axis=1)  # DataFrame.apply with axis=1 passes one row (a Series) at a time
    # Add several columns at once: return a tuple, receive into a column list,
    # and ask apply to expand with result_type='expand'.
    def add_columns(df):
        return df['数量'] * df['单价'], df['数量'] * df['单价'] - df['金额']
    df[['first', 'second']] = df.apply(
        add_columns, axis=1, result_type='expand')
    df1 = df[['数量', '单价', '金额']]  # select a sub-frame
    df2 = df1.applymap(lambda x: int(x))  # applymap: element-wise, DataFrame only
    df.loc[:, ['数量', '单价', '金额']] = df2  # write the converted block back
    print(
        df.head(),
        df['类型'].unique(),
        df1.head(),
        df2.head(),
        sep='\n',
    )
def group_apply():
    """Demo: groupby().apply with a function that returns a DataFrame per group."""
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    df1 = df.groupby(['周几', '类型']).sum()
    def to_1(df, topn):
        '''Min-max normalise 单价 within the group, then return the group's
        top-n rows by 金额; the per-group frames are concatenated by apply.'''
        max_values = df['单价'].max()
        min_values = df['单价'].min()
        df['guiyi'] = df['单价'].apply(lambda x: (
            x - min_values) / (max_values - min_values))
        df1 = df.sort_values(by='金额')[['产品', '数量', '单价']][-topn:]  # ascending sort, take the tail
        return df1
    n = 2
    df2 = df.groupby(['周几', '类型'], group_keys=True).apply(to_1, topn=n)  # keep group keys in the index
    df3 = df.groupby(['周几', '类型'], group_keys=False).apply(to_1, topn=n)
    df4 = df.groupby(['周几', '类型'], as_index=True).apply(
        to_1, topn=n)  # SQL-style grouped result
    df5 = df.groupby(['周几', '类型'], as_index=False).apply(to_1, topn=n)
    print(
        df1,
        df2,
        df3,
        df4,
        df5,
        sep='\n'
    )
def df_toushi():
    """Demo: pivoting a grouped result two equivalent ways (unstack vs pivot).

    Fix: DataFrame.pivot is called with keyword arguments — the positional
    (index, columns, values) form was removed in pandas 2.0.
    """
    df = pd.read_excel(r'E:\study\programme\python\py_1_test\file\d.xlsx')
    df_group = df.groupby(['周几', '类型'])['金额'].agg(
        pv=np.sum)  # named aggregation: the result column is called 'pv'
    # df_group = df.groupby(['周几', '类型'])['金额'].agg(pv=np.size)  # count instead of sum
    # Pivot up: move the innermost row-index level out to the columns.
    df_unstack = df_group.unstack(level=-1, fill_value=None)
    df_unstack.plot()  # x axis = remaining row index, one line per column, legend = level values
    df_reset = df_group.reset_index()  # index levels back into ordinary columns
    df_pivot = df_reset.pivot(index='周几', columns='类型', values='pv')  # same result as df_unstack
    df_pivot.plot()
    print(
        df.sort_values(by=['类型', '周几']).head(),  # sorted preview
        df_group,
        df_unstack,
        df_unstack.stack(level=-1, dropna=True),  # inverse: columns back into the inner row level
        df_reset,
        df_pivot,
        sep='\n',
    )
    plt.show()
def df_MultiIndex():
    """Demo: three ways to construct a pd.MultiIndex."""
    # 1) from parallel arrays — one index level per array, element-wise rows
    people = ['Sohom', 'Suresh', 'kumkum', 'subrata']
    ages = [10, 11, 12, 13]
    marks = [90, 92, 23, 64]
    index1 = pd.MultiIndex.from_arrays(
        [people, ages, marks], names=('names', 'age', 'marks'))
    print(index1)

    # 2) from a DataFrame — one index level per column
    data = {'name': ["Saikat", "Shrestha", "Sandi", "Abinash"],
            'Jobs': ["Software Developer", "System Engineer",
                     "Footballer", "Singer"],
            'Annual Salary(L.P.A)': [12.4, 5.6, 9.3, 10]}
    df1 = pd.DataFrame(data)
    print(df1)
    index2 = pd.MultiIndex.from_frame(df1)
    print(index2)

    # set_index(append=True) keeps the old index and adds new levels;
    # drop=False also leaves the source columns in place
    data1 = {
        'series': ['Peaky blinders', 'Sherlock', 'The crown',
                   'Queens Gambit', 'Friends'],
        'Ratings': [4.5, 5, 3.9, 4.2, 5],
        'Date': [2013, 2010, 2016, 2020, 1994]
    }
    df2 = pd.DataFrame(data1)
    df2.set_index(['series', 'Ratings'], inplace=True, append=True, drop=False)
    print(df2)

    # 3) from tuples — one index level per tuple position
    index3 = pd.MultiIndex.from_tuples([
        ('2023-01-01', 0, 'A'),
        ('2023-01-01', 1, 'B'),
        ('2023-01-01', 2, 'C'),
        ('2023-01-02', 0, 'D'),
        ('2023-01-02', 1, 'E'),
        ('2023-01-02', 2, 'F'),
        ('2023-01-03', 0, 'G'),
        ('2023-01-03', 1, 'H'),
        ('2023-01-03', 2, 'I')
    ], names=['时间', '整数', '字符串'])
    print(index3)
def df_reorder_levels(df):
    """Demo: two equivalent ways to swap the two row-index levels (first 5 rows)."""
    head = df[:5]
    print(
        head.swaplevel(0, 1, axis=0),
        head.reorder_levels([1, 0], axis=0),
    )
def df_replace():
    """Placeholder: replacing values in a DataFrame (to be written)."""
def df_asfo():
    """Placeholder: .asof — row at (or just before) an index label, NaN when absent (to be written)."""
def df_aggregate():
    """Placeholder: aggregation-function examples (to be written)."""
def df_assign():
    """Placeholder: assign — computed columns, similar to apply (to be written)."""
def df_xs(df):
    """Demo: cross-section lookups on hierarchical row and column indexes."""
    row_xs = df.xs("one", level="second")  # rows whose 'second' index level == "one"
    row_loc = df.loc[(slice(None), "one"), :]  # the same selection spelled with loc
    col_xs = df.xs(("one", "bar"), level=("second", "first"), axis=1)  # column cross-section
    print(
        row_xs,
        row_loc,
        col_xs,
        sep='\n'
    )
def df_aligen():
    """Placeholder: index-alignment examples (to be written)."""
def df_rename(df):
    """Demo: relabelling columns, index entries, and index-level names."""
    renamed_cols = df.rename(columns={0: "col0", 1: "col1"})  # relabel columns
    renamed_rows = df.rename(index={"one": "two", "y": "z"})  # relabel index entries
    named_levels = df.rename_axis(index=["abc", "def"])  # give the index levels names
    print(
        renamed_cols,
        renamed_rows,
        named_levels,
    )
def df_sort_index(s):
    """Demo: sorting by (multi-)index levels."""
    print(
        s.sort_index(),  # sort by all index levels
        s.sort_index(level=0),  # sort by the outer level only
        s.sort_index(level=1),  # sort by the inner level only
        s.T.sort_index(level=1, axis=1),  # the same on columns, after a transpose
        s.set_index(["jim", "joe"]),  # rebuild the index from two columns
    )
def df_take(df):
    """Placeholder: take — positional row selection, similar to iloc (to be written)."""
if __name__ == '__main__':
    # run the MultiIndex construction demo when executed as a script
    df_MultiIndex()