DataFrame 之间的合并、连接
- merge 通过键拼接列
- join 默认按索引(index)对齐拼接列
- concat 可以沿着一条轴将多个对象堆叠到一起
添加数据
- append 增加行(row),详见官方文档(注意:DataFrame.append 已在 pandas 2.0 中移除,请改用 pd.concat)
- 增加列:给新的列名赋一个与 df 索引对齐的 Series
# Row index: ten timestamps generated from the start date 2012-10-01.
dates = pd.date_range(start='20121001', periods=10)
# 10 x 3 frame of standard-normal samples with columns 'a', 'b', 'c'.
df = pd.DataFrame(data=np.random.randn(10, 3), index=dates, columns=['a', 'b', 'c'])
# Adding a column: assign a Series that carries the frame's own index.
df['d'] = pd.Series(data=np.random.randn(10), index=df.index)
参考博客
遍历数据
# Fragment of a function body (its `def` header is not part of these notes):
# walks every cell of `data` and applies `exp` to it, then returns `data`.
# `exp` is not defined in this snippet; presumably math.exp or numpy.exp - confirm.
# NOTE(review): the pandas documentation warns that iterrows() may yield a copy
# of each row, in which case the assignment to `row` below does not change
# `data` - confirm, and consider `data.loc[index, col_name] = ...` instead.
for index, row in data.iterrows(): # each iteration yields the row's index label and the row itself
for col_name in data.columns:
row[col_name] = exp(row[col_name]) # stores exp(value) on `row`; intended to end up in `data` (see note above)
return data
读文件
报错:
UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 205: illegal multibyte sequence
解决:打开文件时显式指定编码(encoding)
# Open the log for reading with an explicit UTF-8 encoding; this is the fix the
# notes give for the 'gbk' UnicodeDecodeError quoted above.
FILE_OBJECT = open('order.log', mode='r', encoding='UTF-8')
实例
import json
import pandas as pd
import pymongo
from conf import *
# Module-level MongoDB handles, created by top-level statements of this script.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they are
# provided by `from conf import *` - confirm.
client = pymongo.MongoClient(MONGO_URL)
# Database handle looked up by name; main() writes its results through `db`.
db = client[MONGO_DB]
def get_data_frame(path='./data/xxx.dat'):
    """Load the per-person records file into a DataFrame.

    The file is read as UTF-8 with one JSON object per line. Records whose
    ``price`` equals zero are dropped; blank lines are ignored.

    Args:
        path: Location of the data file. Defaults to the path that was
            previously hard-coded, so existing no-argument calls are unchanged.

    Returns:
        A DataFrame with the columns ``name``, ``contact_name``,
        ``contact_mobile``, ``price`` and ``type_id`` (in that order) and a
        default 0..n-1 index, one row per kept record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``price`` or, for a kept record, any of
            the output columns.
    """
    columns = ['name', 'contact_name', 'contact_mobile', 'price', 'type_id']
    records = []
    with open(path, mode='r', encoding='utf-8') as f:
        # Each line holds the information of one person.
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would make json.loads raise.
                continue
            data = json.loads(line)
            if data['price'] != 0.0:
                # Keep only the fields that become columns; extra keys are ignored.
                records.append({key: data[key] for key in columns})
    # `columns` fixes the column order and keeps the columns for an empty file.
    return pd.DataFrame(records, columns=columns)
def get_type_frame(path='./data/type_id.dat'):
    """Load the type/star mapping file into a DataFrame.

    The file is read as UTF-8 with one ``type::star`` pair per line. Blank
    lines are ignored.

    Args:
        path: Location of the mapping file. Defaults to the path that was
            previously hard-coded, so existing no-argument calls are unchanged.

    Returns:
        A DataFrame with the columns ``star_name`` and ``type_name`` (in that
        order) and a default 0..n-1 index, one row per non-blank line.

    Raises:
        IndexError: If a non-blank line contains no ``::`` separator.
    """
    type_names = []
    star_names = []
    with open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line has no "::" and would raise IndexError.
                continue
            parts = line.split("::")
            type_names.append(parts[0])
            # The second field carries the line's trailing newline; strip it.
            star_names.append(parts[1].strip())
    return pd.DataFrame(
        {'star_name': star_names, 'type_name': type_names},
        columns=['star_name', 'type_name'],
    )
def main():
    """Compute per-type price statistics and store them in MongoDB.

    Loads both input frames, joins them on ``name`` / ``star_name`` to obtain
    (type_id, type_name) pairs, computes the mean and the count of ``price``
    for each pair's ``type_id``, and inserts every resulting row, ordered by
    count and then mean (both descending), into ``db[MONGO_TABLE]``.
    """
    data_frame = get_data_frame()
    type_frame = get_type_frame()

    # Join the two frames on name / star_name and keep only type_id and
    # type_name. Plain column selection replaces `.ix`, which was removed in
    # pandas 1.0; `.copy()` makes the result an independent frame so the
    # column assignments further down do not target a slice of the merge.
    type_id_frame = pd.merge(
        data_frame, type_frame,
        left_on='name', right_on='star_name', sort=False,
    )[['type_id', 'type_name']].copy()

    # Attach type_name to the data rows through type_id.
    # NOTE(review): type_id_frame is not de-duplicated, so this merge repeats
    # a data row once per matching type_id_frame row; the counts computed
    # below are taken over those repeated rows - confirm this is intended.
    data_frame = pd.merge(data_frame, type_id_frame, on="type_id", sort=False)

    # For every row of type_id_frame, compute the mean and the count of the
    # prices that share its type_id.
    type_mean_list = []
    type_count_list = []
    for type_id in type_id_frame.type_id:
        # Select the matching prices once and reuse them for both statistics.
        prices = data_frame.loc[data_frame.type_id == type_id, 'price']
        type_mean_list.append(prices.mean())
        type_count_list.append(prices.count())
    type_id_frame['type_mean'] = pd.Series(type_mean_list, index=type_id_frame.index)
    type_id_frame['type_count'] = pd.Series(type_count_list, index=type_id_frame.index)

    # Sort by type_count, then type_mean, and insert each row as a document.
    # `insert_one` replaces `Collection.insert`, which PyMongo 4 removed.
    ranked = type_id_frame.sort_values(
        by=['type_count', 'type_mean'], ascending=False)
    for _, row in ranked.iterrows():
        db[MONGO_TABLE].insert_one(row.to_dict())
    print("ok...")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()