# Using Series
from pandas import Series, concat

# A Series holds a one-dimensional sequence of values addressed by index labels.
x = Series(
    data=['a', 'b', 'fef'],
    index=['aa', 'bb', 'cc'],
)

# Read a single element by its label.
print(x['cc'])

# Overwrite the value stored under an existing label.
x['aa'] = 'dwdw'

# drop() returns a new Series, so the name is rebound to keep the result.
x = x.drop('cc')

# Adding an element also produces a new Series; it must be assigned back or
# the addition is lost. concat() is used because Series.append() no longer
# exists in pandas 2.x.
x = concat([x, Series(data=['dwd'], index=['r'])])

print(x.index, x.values)
# Basic DataFrame usage
from pandas import DataFrame
# Build a small demo frame: three columns, rows labelled first/second/third.
df = DataFrame(
    data={
        'name': ['张三', '李四', '王五'],
        'age': [18, 20, 19],
        'hobby': ['唱歌', '跳舞', 'rap'],
    },
    index=['first', 'second', 'third'],
)
print(df)

# Selecting a single column by name.
print(df['name'])

# --- Label-based access with .loc ---
# Single cell (row label, column label). Printed so the lookup is visible,
# like every other example in this section.
print(df.loc['first', 'age'])
# All rows of one column.
print(df.loc[:, 'age'])
# Label slices on both axes; with .loc both slice end points are included.
print(df.loc['first':'third', 'name':'hobby'])
# Explicit lists of row labels and column labels.
print(df.loc[['first', 'third'], ['name', 'hobby']])

# --- Position-based access with .iloc ---
# Integer slices; the end position is excluded, as with Python lists.
print(df.iloc[0:2, 0:2])
# Explicit lists of row positions and column positions.
print(df.iloc[[0, 2], [0, 2]])
# Writing a DataFrame to CSV
from pandas import DataFrame
# Three records with a name and an age each, laid out column by column.
df = DataFrame(
    {
        'name': ['张三', '栗子', '王五'],
        'age': [12, 18, 15],
    }
)

# Write only the cell values: no header row and no index column.
df.to_csv('哈哈.csv', header=False, index=False)
# Basic data-processing methods in pandas
# Sample data link (no URL is present in this file)
import pandas as pd
# 1. Read CSV data
# Load the sample CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='数据分析data/data.csv')
print(df)
# 2. Remove duplicate rows
# Discard rows that repeat an earlier row in full, then show what is left.
df = df.drop_duplicates()
print(df)
# 3. Handle missing values
# Two alternative strategies for missing values. Each one is applied to the
# freshly loaded data: running fillna() on the output of dropna() would do
# nothing, because no missing cells remain by then.
raw = pd.read_csv('数据分析data/data1.csv')

# Strategy A: discard every row that contains at least one missing value.
df = raw.dropna()

# Strategy B: keep all rows and put a placeholder into the missing cells.
filled = raw.fillna('Not provide')

print(df)
print(filled)
# 4. String operations on a column (replace, strip whitespace, split, etc.)
df = pd.read_csv('数据分析data/data2.csv')

# Clean the 'name' column in two passes through the .str accessor:
# first swap the substring 'JIMI' for 'TOM' ...
df['name'] = df['name'].str.replace('JIMI', 'TOM')
# ... then trim leading and trailing whitespace from every value.
df['name'] = df['name'].str.strip()

print(df)
# 5. Filter rows
# Load the pipe-delimited file and list the columns available for filtering.
df = pd.read_csv('数据分析data/data4.csv', sep='|')
print(df.columns)

# Single condition: rows with at least 10000 comments.
print(df[df['comments'] >= 10000])

# AND of two conditions: comment count inside the 1000..10000 band.
# Each comparison needs its own parentheses because & binds tighter than >=.
print(df[(df['comments'] >= 1000) & (df['comments'] <= 10000)])

# OR of two conditions: comment count at or below 1000, or at or above 10000.
# (Testing >= on both sides would collapse into a plain >= 1000 filter.)
print(df[(df['comments'] <= 1000) | (df['comments'] >= 10000)])

# Substring match on the title; na=False treats missing titles as no match.
print(df[df['title'].str.contains('台电', na=False)])
# 6. Combine data
from pandas import DataFrame
import pandas as pd
# Read the three pipe-delimited parts.
df1 = pd.read_csv('数据分析data/data6-1.csv', sep='|')
df2 = pd.read_csv('数据分析data/data6-2.csv', sep='|')
df3 = pd.read_csv('数据分析data/data6-3.csv', sep='|')

# Stack them on top of each other; ignore_index=True discards the per-file
# row labels and numbers the combined rows 0..n-1.
df = pd.concat([df1, df2, df3], ignore_index=True)
print(df)
# Both files are read with explicitly supplied column names.
df1 = pd.read_csv(
    '数据分析data/data7-1.csv',
    sep='|',
    names=['id', 'comments', 'title'],
)
df2 = pd.read_csv(
    '数据分析data/data7-2.csv',
    sep='|',
    names=['id', 'comments', 'title'],
)

# Join the two frames on the shared 'id' column.
df = df1.merge(df2, on='id')
print(df)
df = pd.read_csv('数据分析data/data8.csv', sep='|')

# Add a derived column: element-wise product of 'num' and 'price'.
df['totalPrice'] = df['num'].mul(df['price'])
print(df)
# Group-by aggregation
import pandas as pd
# A forward slash is used as the separator, matching the other relative paths
# in this file; a backslash before 'g' would be an invalid escape sequence
# and would only work as a path separator on Windows.
df = pd.read_csv('数据分析data/groupby.csv')

# Number of non-null 'id' values per address.
res1 = df.groupby('address')['id'].count().reset_index()

# Average 'score' per address.
res2 = df.groupby('address')['score'].mean().reset_index()

# Several statistics of 'score' per address, one result column per statistic.
res3 = (
    df.groupby('address')['score']
    .agg(['mean', 'max', 'min', 'sum', 'median'])
    .reset_index()
)
print(res3)

# Different aggregations for different columns, given as a dict of
# column name -> statistic(s).
res4 = df.groupby('address').agg({'score': 'mean', 'age': ['max', 'min']}).reset_index()
print(res4)
import pandas as pd
# Raw string: the Windows path contains backslashes followed by letters
# (\p, \c, \l) that are not valid escape sequences. The raw literal yields
# exactly the same path text without relying on that lenient parsing.
df = pd.read_csv(r'D:\python_project\python爬虫\chromedriver\lianjia_data.csv')

# Per community: mean of 'unit_price' and count of non-null 'unit_price' rows.
temp = df.groupby('community')['unit_price'].agg(['mean', 'count']).reset_index()

# One [community, value] pair per community. The value is the mean divided by
# 10000 and rounded to one decimal place; communities with fewer than 3 rows
# get 0 instead.
result = [
    [row['community'], round(row['mean'] / 10000, 1)]
    if row['count'] >= 3
    else [row['community'], 0]
    for _, row in temp.iterrows()
]

# Keep the ten pairs with the highest value, largest first.
result = sorted(result, key=lambda x: x[1], reverse=True)[0:10]