读写 CSV 、pickle文件

最新推荐文章于 2024-04-19 22:00:43 发布

sisteryaya

最新推荐文章于 2024-04-19 22:00:43 发布

阅读量1.9k

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/sisteryaya/article/details/76906980

版权

python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

1、将数据写入 CSV 文件时，先写入表头（列名）：

    # Write one location record to a CSV file, preceded by a header row.
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' on write can end up with a blank line after each row.
    columns = ['name', 'longitude', 'latitude']
    with open('/Users/suhong/Desktop/tea3.csv', 'w', newline='') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=columns)
        dict_writer.writeheader()
        record = {
            'name': place,
            'longitude': str(lng_lat[0]),
            'latitude': str(lng_lat[1]),
        }
        dict_writer.writerow(record)

参数 newline 用来控制文本模式下换行符的处理方式，可以是 None、''、'\n'、'\r'、'\r\n'。若不加 newline=''，在 Windows 上存储的数据每一条之间都会多出一个空行。

若存入的数据中包含汉字，还可在 open() 中加入 encoding='utf-8'

2、写入新的文件，不要索引：

df.to_csv('tea _test2.csv',index=False)  # write to a new file, leaving out the index column

3、分块读

# Load a large CSV in fixed-size pieces, then stitch the pieces back together.
path_file = '../data/data.csv'
reader = pd.read_csv(
    path_file,
    iterator=True,    # return a TextFileReader so the file can be pulled chunk by chunk
    header=None,      # do not take column names from the file; use `names` instead
    names=['user_id', 'shop_id', 'time_stamp'],
    parse_dates=[2],  # parse the column at index 2 (time_stamp) as dates
)

chunk_size = 100000  # number of rows requested per chunk
chunks = []
while True:
    try:
        piece = reader.get_chunk(chunk_size)
    except StopIteration:
        # get_chunk signals the end of the file by raising StopIteration.
        print('Iteration is stopped.')
        break
    chunks.append(piece)
full_data = pd.concat(chunks, ignore_index=True)

4、带日期的文件，读入时解析日期

def dateparse(dates):
    """Parse 'YYYY-MM-DD HH' strings, e.g. '2014-12-16 18' -> 2014-12-16 18:00:00."""
    # pd.datetime was deprecated in pandas 1.0 and removed in 2.0;
    # pd.to_datetime with an explicit format performs the same parsing.
    return pd.to_datetime(dates, format='%Y-%m-%d %H')


# The data file is too large to load at once, so it is processed chunk by chunk.
# The with-block closes the file handle once every chunk has been consumed.
with open(path_file, 'r') as data_file:
    for df in pd.read_csv(data_file,
                          parse_dates=['time'],   # parse the 'time' column as datetimes
                          index_col=['time'],     # and use it as the index
                          # NOTE: newer pandas (>= 2.0) deprecates date_parser in
                          # favour of date_format='%Y-%m-%d %H'; switch when the
                          # minimum supported pandas version allows it.
                          date_parser=dateparse,
                          chunksize=100000):      # rows per chunk
        # Select rows by date through .loc: plain df['2014-11-28'] is a column
        # lookup in current pandas and no longer falls back to row selection.
        df_part_1 = df.loc['2014-11-22':'2014-11-27']
        df_part_1_label = df.loc['2014-11-28']

        df_part_1.to_csv(path_file_1,
                         columns=['user_id', 'item_id', 'behavior_type', 'item_category'],  # only these columns
                         header=False, mode='a')  # no column names; append to the file

5、文件过大时，也可分块预测

# Score a file that is too large for memory one chunk at a time and append
# the positive predictions to the result file.
# The with-block makes sure the input file handle is closed when the loop ends
# (the bare open(...) passed to read_csv was never closed).
with open(path_file, 'r') as data_file:
    for pred_df in pd.read_csv(data_file, chunksize=100000):
        # predict
        pred_df['pred_label'] = bst.predict(pred_df[col], num_iteration=bst.best_iteration)
        # save only the rows predicted as 1
        positives = pred_df[pred_df['pred_label'] == 1]
        positives.to_csv(path_result,
                         columns=['user_id', 'item_id'],
                         index=False, header=False, mode='a')  # no index, no column names, append

6、按日期将记录追加写入对应的 CSV 文件

# Dates whose CSV header row has already been written during this run.
date_dictionary = {}


def writeByDate(date, words):
    """Append ``words`` as one row to ``../data/date/<date>.csv``.

    The first time a given ``date`` is seen in this process, the column
    header row is written before the data row.

    Args:
        date: File name stem; also the key recorded in ``date_dictionary``.
        words: Sequence of field values written as a single CSV row.

    NOTE(review): the file is opened in append mode, so if the file already
    exists from an earlier run a second header row ends up in the middle of
    it -- confirm whether that is intended.
    """
    # Build the target path instead of os.chdir()-ing into the directory and
    # back: chdir mutates process-wide state, left the working directory
    # changed if the write failed, and assumed the caller's directory is
    # named 'preprocess'.
    file_path = os.path.join('../data/date/', date + ".csv")
    first_write = date not in date_dictionary
    # 'a' appends to the file; newline='' leaves line endings to the csv module.
    with open(file_path, 'a', newline='') as f:
        write = csv.writer(f)
        if first_write:
            write.writerow(['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'hour'])
        write.writerow(words)
    # Record the date only after the header has actually been written.
    date_dictionary[date] = True

7、将多个CSV文件放在同一个文件夹中，逐个读取，进行操作，以字典形式保存

def genUidIid():
    """Aggregate each daily CSV into a dict keyed by (user_id, item_id) and pickle it.

    Every file in ``../data/date/`` is read with its first row skipped as the
    header. Rows are grouped by ``(row[0], row[1])``; each value is
    ``[counts, geohashes, categories, hours]`` where

    * ``counts`` is a 4-slot list, slot ``int(row[2]) - 1`` counting that behavior,
    * ``geohashes`` holds the distinct ``row[3]`` values in first-seen order,
    * ``categories`` holds the distinct ``row[4]`` values in first-seen order,
    * ``hours`` holds ``int(row[5])`` for the first row and for each row that
      introduced a new category, kept sorted.

    Example value: ``{('101266396', '10001082'): [[1, 0, 0, 0], [''], ['5932'], [22]]}``.
    The result for ``<name>.csv`` is written to
    ``../data/dictionary/date/<name>.pkl``.
    """
    out_dir = "../data/dictionary/date"
    # makedirs(exist_ok=True) creates missing parents and tolerates re-runs,
    # where a plain os.mkdir would raise.
    os.makedirs(out_dir, exist_ok=True)
    direction = "../data/date/"
    for file_name in os.listdir(direction):
        dictionary = {}
        # newline='' is what the csv module expects for files it reads.
        with open(direction + file_name, newline='') as f:
            rows = csv.reader(f)
            next(rows, None)  # skip the header row; tolerate an empty file
            for row in rows:
                # A tuple is hashable and can be a dict key; a list cannot.
                sample = (row[0], row[1])
                behavior = int(row[2]) - 1
                entry = dictionary.get(sample)
                if entry is None:
                    counts = [0, 0, 0, 0]
                    counts[behavior] = 1
                    dictionary[sample] = [counts, [row[3]], [row[4]], [int(row[5])]]
                    continue

                counts, geohashes, categories, hours = entry
                counts[behavior] += 1
                # Collect every distinct geohash (possibly empty) seen for this pair.
                if row[3] not in geohashes:
                    geohashes.append(row[3])
                # The hour is recorded only when the row brings a new category.
                if row[4] not in categories:
                    categories.append(row[4])
                    hours.append(int(row[5]))
                    hours.sort()

        pkl_path = os.path.join(out_dir, file_name.split('.')[0] + ".pkl")
        with open(pkl_path, 'wb') as f:
            pickle.dump(dictionary, f, -1)  # -1 selects the highest pickle protocol

8、保存、读取pickle文件

import pickle
import csv

# Save: dump the dictionary using the highest available pickle protocol (-1).
# The with-block closes the file even if pickling raises.
with open("../data/dictionary/uidfeature.pkl", 'wb') as f:
    pickle.dump(dictionary, f, -1)

# Load: read the object back, closing the handle afterwards.
# NOTE: only unpickle files from a trusted source; pickle.load can run
# arbitrary code embedded in the file.
with open("../data/dictionary//uidfeature.pkl", "rb") as f:
    item = pickle.load(f)

9、