pandas读取2000w行csv文件

最新推荐文章于 2023-04-04 12:23:47 发布

想念@思恋

最新推荐文章于 2023-04-04 12:23:47 发布

阅读量715

点赞数 1

分类专栏： python编程

本文链接：https://blog.csdn.net/tailonh/article/details/107799273

版权

python编程专栏收录该内容

136 篇文章 10 订阅

订阅专栏

import json
import pandas as pd
def print_csv(src_csv, tgt_csv, chunk_size=500000):
	"""Convert a large CSV file into a JSON-lines file, one chunk at a time.

	The CSV is streamed in chunks so that files with tens of millions of
	rows never have to fit in memory at once. For every row, the
	``classification`` column (IPC classification number) and the ``abs``
	column (patent abstract) are written to the target file as one JSON
	object per line: ``{"id": <classification>, "text": <abs>}``.
	Rows where either of the two values is missing are skipped.

	Args:
		src_csv: Path of the CSV file to read. It must contain the columns
			``classification`` and ``abs``.
		tgt_csv: Path of the output file; it is overwritten and encoded
			as UTF-8.
		chunk_size: Number of CSV rows loaded per chunk.

	Raises:
		KeyError: If one of the required columns is missing from the CSV.
	"""
	# Open the source first so a bad input path does not truncate the target.
	reader = pd.read_csv(src_csv, chunksize=chunk_size)
	try:
		with open(tgt_csv, 'w', encoding='utf-8') as out:
			sum_line = 0  # rows read so far, including skipped ones
			for chunk in reader:
				# Progress report: number of rows consumed before this chunk.
				print("第%d行" % sum_line)
				sum_line += len(chunk)
				# Walk the two columns positionally; this does not depend on
				# the chunk's index labels.
				for num, text in zip(chunk['classification'], chunk['abs']):
					if pd.isna(num) or pd.isna(text):
						continue  # skip rows with a missing value
					record = {'id': num, 'text': text}
					out.write(json.dumps(record, ensure_ascii=False) + '\n')
	finally:
		# Release the CSV file handle even if processing fails midway.
		reader.close()
	print("Iteration is stopped")

if __name__ == '__main__':
	# Script entry point: convert the patent CSV into a JSON-lines file.
	source_path, target_path = 'patent.csv', 'new_patent.json'
	print_csv(source_path, target_path)