import json
import pandas as pd
def print_csv(src_csv, tgt_csv, chunk_size=500000):
    """Convert a large CSV file into a JSON Lines file, chunk by chunk.

    The CSV is streamed with pandas in chunks of ``chunk_size`` rows so the
    whole file never has to fit in memory. For every row whose
    ``classification`` (IPC classification number) and ``abs`` (patent
    abstract) cells are both present, one JSON object
    ``{"id": <classification>, "text": <abs>}`` is written on its own line.
    Rows with either cell missing are skipped.

    Args:
        src_csv: Path of the CSV file to read. It must contain the columns
            ``classification`` and ``abs``.
        tgt_csv: Path of the output file. Despite the name it receives JSON
            Lines, encoded as UTF-8; an existing file is overwritten.
        chunk_size: Number of CSV rows loaded per chunk.

    Returns:
        None. Progress is reported on stdout once per chunk.

    Raises:
        KeyError: If a required column is missing from the CSV.
        OSError: If either file cannot be opened.
    """
    rows_seen = 0  # every CSV row read so far, including skipped ones

    # Open the source first so that a missing or unreadable CSV does not
    # truncate an existing target file. Both handles are closed by the
    # ``with`` statement even if a row fails to serialize.
    with pd.read_csv(src_csv, chunksize=chunk_size) as reader, \
            open(tgt_csv, 'w', encoding='utf-8') as out:
        for chunk in reader:
            print("第%d行" % rows_seen)
            # Iterating the Series directly avoids depending on the chunk's
            # index labels and yields plain Python scalars for json.dumps.
            for ipc_code, abstract in zip(chunk['classification'],
                                          chunk['abs']):
                rows_seen += 1
                if pd.isna(ipc_code) or pd.isna(abstract):
                    continue  # skip rows with a missing cell
                record = {'id': ipc_code, 'text': abstract}
                out.write(json.dumps(record, ensure_ascii=False) + '\n')
    print("Iteration is stopped")
if __name__ == '__main__':
    # Script entry point: convert the patent CSV in the working directory.
    # Note that the "target" is a JSON Lines file, not a CSV.
    src_csv, tgt_csv = 'patent.csv', 'new_patent.json'
    print_csv(src_csv=src_csv, tgt_csv=tgt_csv)
# Reading a 20-million-row CSV file with pandas
# (Source page note: latest recommended article published 2023-04-04 12:23:47)