使用模块
import pandas as pd
使用pandas读取csv文件
csv_path = r'D:\Code\Other\ant-learn-pandas\datas\ml-latest-small\ratings.csv'
df = pd.read_csv(csv_path)
print(df.head())
print(df.shape)
print(df.columns)
print(df.index)
print(df.dtypes)
使用pandas读取txt文件
txt_path = r'D:\Code\Other\ant-learn-pandas\datas\crazyant\access_pvuv.txt'
df = pd.read_csv(txt_path, sep='\t', header=None, names=['pdate', 'pv', 'uv'])
print(df)
使用pandas读取Excel文件
excel_path = r'D:\Code\Other\ant-learn-pandas\datas\crazyant\access_pvuv.xlsx'
df = pd.read_excel(excel_path, names=['date', 'num1', 'num2'])
print(df)
print(df.head())
print(df['date'][1])
print(df['num1'][[1, 2]])
print(type(df['date'][1]))
print(df.dtypes)
使用pandas读取mysql表
import pymysql
conn = pymysql.connect(host='127.0.0.1', port=3306, user=USERNAME, password=PASSWORD, database=DATABASENAME, charset='utf8')
df = pd.read_sql("select * from company", con=conn)
print(df)
使用pandas中创建Series:方法一
s1 = pd.Series([1, 'a', 5.2, 7])
print(s1)
print(s1.index)
print(s1.values)
使用pandas中创建Series:方法二
s2 = pd.Series([1, 'a', 5.2, 7], index=['a', 'b', 'c', 'd'])
print(s2)
print(s2.index)
print(s2['a'])
print(type(s2['c']))
print(s2[['b', 'd']])
print(type(s2[['b', 'd']]))
使用pandas中创建Series:方法三
sdata = {'Ohio': 35000, 'Teaxs': 72000, 'Oregon': 16000, 'Utah': 5000}
s3 = pd.Series(sdata)
print(s3)
使用pandas中创建DataFrame
data = {
'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2003, 2004],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = pd.DataFrame(data)
print(df)
print(df.dtypes)
print(df.index)
print(df.columns)
print(df['year'])
print(type(df['year']))
print(df[['year', 'pop']])
print(type(df[['year', 'pop']]))
print(df.loc[-1:])
print(df.tail(1))
print(df[-1:])
print(type(df.loc[1]))
print(df.loc[1:2])
print(type(df.loc[[1, 2]]))
使用pandas中的五种查询方式
tianqi_path = r'D:\Code\Other\ant-learn-pandas\datas\beijing_tianqi\beijing_tianqi_2018.csv'
df = pd.read_csv(tianqi_path)
print(tianqi.head())
df.set_index('ymd', inplace=True)
print(df.index)
df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
print(df.head())
"""================一:使用单个label值查询数据======================="""
print(df.loc['2018-01-03', 'bWendu'])
print(df.loc['2018-01-03', ['bWendu', 'yWendu']])
"""================二:使用值列表批量查询======================="""
print(df.loc[['2018-01-03', '2018-01-04', '2018-01-05'], 'bWendu'])
print(df.loc[['2018-01-03', '2018-01-04', '2018-01-05'], ['bWendu', 'yWendu']])
"""================三:使用数值区间进行范围查找======================="""
print(df.loc['2018-01-03':'2018-01-05', 'bWendu'])
print(df.loc['2018-01-03', 'bWendu':'yWendu'])
print(df.loc['2018-01-03':'2018-01-05', 'bWendu':'yWendu'])
"""================四:使用条件表达式查询======================="""
print(df.loc[tianqi['yWendu'] < -10, :])
if_ = (df['bWendu'] <= 30) & (df['yWendu'] >= 15) & (df['tianqi'] == '晴') & (df['aqiLevel'] == 1)
print(df.loc[if_, :])
"""================五:调用函数查询======================="""
print(df.loc[lambda df:(df['bWendu'] <= 30) & (df['yWendu'] >= 15), :])
def query_my_data(df):
return df.index.str.startswith('2018-09') & df['aqiLevel'] == 1
print(df.loc[query_my_data(tianqi), :])
使用pandas中的四种赋值方法
tianqi_path = r'D:\Code\Other\ant-learn-pandas\datas\beijing_tianqi\beijing_tianqi_2018.csv'
df = pd.read_csv(tianqi_path)
print(df.head())
df.set_index('ymd', inplace=True)
print(df.index)
"""================一:直接赋值======================="""
df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
df.loc[:, 'wencha'] = df['bWendu'] - df['yWendu']
df['wencha'] = df['bWendu'] - df['yWendu']
print(df.head())
"""================二:df.apply方法======================="""
def get_wen_type(x):
if x['bWendu'] > 33:
return'高温'
elif x['yWendu'] < -10:
return '低温'
else:
return '常温'
df['wentype'] = df.apply(get_wen_type, axis=1)
print(df.head())
print(df['wentype'].value_counts())
"""================三:df.assign方法======================="""
print(df.assign(
yWendu_huashi = lambda x : x['yWendu'] * 9 / 5 + 32,
bWendu_huashi = lambda x : x['bWendu'] * 9 / 5 + 32
))
"""================四:按条件选择分组分别赋值======================="""
df['wencha_type'] = ''
df.loc[df['bWendu'] - df['yWendu'] > 10, 'wencha_type'] = '温差大'
df.loc[df['bWendu'] - df['yWendu'] <= 10, 'wencha_type'] = '温度正大'
print(df.head())
print(df['wencha_type'].value_counts())
pandas的数据统计函数
tianqi_path = r'D:\Code\Other\ant-learn-pandas\datas\beijing_tianqi\beijing_tianqi_2018.csv'
df = pd.read_csv(tianqi_path)
df['bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
df['yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
print(df.head())
"""================一:汇总类统计======================="""
print(df.describe())
print(df['bWendu'].mean())
print(df['bWendu'].min())
print(df['bWendu'].max())
"""================二:唯一去重和按值计数======================="""
print(df['fengxiang'].unique())
print(df['tianqi'].unique())
print(df['fengli'].unique())
print(df['fengxiang'].value_counts())
print(df['tianqi'].value_counts())
print(df['fengli'].value_counts())
"""================三:相关系数与协方差======================="""
'''
用途:1、两只股票是不是同涨同跌,程度多大,正相关还是负相关;2、产品向量的波动,跟那些因素正相关,负相关,程度有多大
1、协方差:衡量同向反向程度,如果协方差为正,说明X,Y同向变化,协方差越大说明同向程度越高,如果协方差为负,说明X,Y反向运动,协方差越小,说明
反向程度越高
2、相关系数:衡量相似度程度,当他们的相关系数为1时,说明两个变量变化时的正向相似度最大,当相关系数为-1时,说明两个变量变化的反向相似度最大
'''
print(df.cov())
print(df.corr())
print(df['aqi'].corr(df['bWendu']))
print(df['aqi'].corr(df['yWendu']))
print(df['aqi'].corr(df['bWendu'] - df['yWendu']))
pandas对缺失值的处理
'''
Pandas使用这些函数处理缺失值:
1、isnull和notnull:检测是否是空值,可用于df和Series
2、dropna:丢弃,删除缺失值
1、axis:删除行还是列,{0 or 'index', 1 or 'columns'}, default 0
2、how:如果等于any,则任何值为空都删除,如果等于all,则所有数值都为空才删除
3、inplace:如果为True,则修改当前df,否则返回新的df
3、fillna:填充空值
1、value:用于填充的值,可以是单个值,或者字典(key是列名,value是值)
2、method:等于ffill使用前一个不为空的值填充forword fill;等于bfill使用后一个不为空的值填充backword fill
3、axis:按行还是按列填充,{0 or 'index', 1 or 'columns'}
4、inplace:如果为True,则修改当前df,否则返回新的df
'''
student_path = r'D:\Code\Other\ant-learn-pandas\datas\student_excel\student_excel.xlsx'
df = pd.read_excel(student_path, skiprows=2)
print(df)
print(df.isnull())
print(df.notnull())
print(df['分数'].isnull())
print(df.loc[df['分数'].notnull(), :])
df.dropna(axis='columns', how='all', inplace=True)
df.dropna(axis='index', how='all', inplace=True)
df['分数'] = df['分数'].fillna(0)
df['姓名'] = df['姓名'].fillna(method='ffill')
df.to_excel(r'D:\Code\Other\ant-learn-pandas\datas\student_excel\student.xlsx', index=False)
print(df)
pandas的SwttingWithCopyWarning报警
'''
总之,pandas不允许先帅选子DataFrame,再进行修改写入
要么使用.loc实现一个步骤直接修改源DataFrame
要么先复制一个子DataFrame再一个步骤执行修改
'''
df_path = r'D:\Code\Other\ant-learn-pandas\datas\beijing_tianqi\beijing_tianqi_2018.csv'
df = pd.read_csv(df_path)
df['bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
df['yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
condition = df['ymd'].str.startswith('2018-03')
df[condition]['wencha'] = df['bWendu'] - df['yWendu']
print(df[condition].head())
"""================解决方法一======================="""
df.loc[condition, 'wendcha'] = df['bWendu'] - df['yWendu']
print(df[condition].head())
"""================解决方法二======================="""
new_df = df[condition].copy()
new_df['wencha'] = new_df['bWendu'] - new_df['yWendu']
print(new_df)
pandas对数据进行排序
'''
Series的排序:
series.sort_values(ascending=True, inplace=Flase)
参数说明:
1、ascending:默认为True升序排序,为False降序排序
2、inplace:是否修改原始series
DataFrame的排序:
DataFrame.sort_values(by, ascending=True, inplace=False)
参数说明:
1、by:字符串或者List<字符串>,单列排序或者多列排序
2、ascending:默认为True升序排序,为False降序排序
3、inplace:是否修改原始series
'''
df_path = r'D:\Code\Other\ant-learn-pandas\datas\beijing_tianqi\beijing_tianqi_2018.csv'
df = pd.read_csv(df_path)
df.loc[:, 'bWendu'] = df['bWendu'].str.replace('℃', '').astype('int32')
df.loc[:, 'yWendu'] = df['yWendu'].str.replace('℃', '').astype('int32')
"""================对Series排序======================="""
print(df['aqi'].sort_values(ascending=False))
print(df.loc[:, 'aqi'].sort_values())
print(df['tianqi'].sort_values())
"""================对DataFrame排序======================="""
print(df.sort_values(by='bWendu', ascending=False))
print(df.sort_values(by=['aqiLevel', 'bWendu'], ascending=False))
print(df.sort_values(by=['aqiLevel', 'bWendu'], ascending=[True, False]))