# -*- coding: utf-8 -*-
import json
import re
def each_chunk(stream, separator, chunk_size=4096):
    """Yield pieces of a text stream, split after each *separator*.

    The stream is consumed ``chunk_size`` characters at a time; only the
    text read since the last separator is kept in the internal buffer.

    Args:
        stream: Object with a ``read(size)`` method that returns ``str``
            and returns an empty string at end of input.
        separator: String that terminates each piece. An empty separator
            never matches, so the whole input is yielded as the single
            final piece.
        chunk_size: Number of characters requested per ``read`` call.

    Yields:
        Each piece that ends with *separator* (the separator is included),
        followed by one final piece holding whatever text came after the
        last separator. The final piece is always yielded and may be ``''``.
    """
    buffer = ''
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            # EOF: flush the unterminated tail (possibly an empty string).
            yield buffer
            break
        buffer += chunk
        if not separator:
            # str.split rejects an empty separator; treat it as "no match".
            continue
        # Split once per read: every element but the last is a complete
        # piece, and the last one is the still-unterminated tail. A
        # separator that straddles two reads is found on the next pass
        # because the tail stays in the buffer.
        *complete, buffer = buffer.split(separator)
        for part in complete:
            yield part + separator
# Matches ``NumberInt(123)`` wrappers (presumably left over from a
# MongoDB-style export); they are not valid JSON and must be unwrapped.
_NUMBER_INT = re.compile(r'NumberInt\(\s*(-?\d+)\s*\)')

# Reading stops once the chunk index exceeds this value, so only a sample
# of the input file is processed.
MAX_CHUNK_INDEX = 3000


def _parse_record(raw):
    """Clean the accumulated text of one record and parse it as JSON.

    Leading/trailing ``[``, ``,`` and newline characters are stripped (in
    that order), then every ``NumberInt(<int>)`` wrapper is replaced by the
    bare integer. Only those wrappers are touched, so parentheses inside
    titles or other string fields are preserved.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
    """
    cleaned = raw.strip('[').strip(',').strip('\n')
    cleaned = _NUMBER_INT.sub(r'\1', cleaned)
    return json.loads(cleaned)


paper = ''   # raw text of the record currently being accumulated
papers = []  # parsed records, in file order
temp = []
with open('data/dblpv13.json', 'r', encoding='utf-8') as myFile:
    for i, chunk in enumerate(each_chunk(myFile, separator='}')):
        print('--'*15)
        print(chunk)
        if "title" in chunk:
            # A chunk mentioning "title" is treated as the start of a new
            # record, so the text gathered so far is the previous record.
            # NOTE(review): `i > 1` presumably skips the flush for the very
            # first record, when nothing complete has been gathered yet --
            # confirm why the bound is 1 rather than 0.
            if i > 1:
                paper_temp = _parse_record(paper)
                papers.append(paper_temp)
            paper = chunk
        else:
            paper += chunk
        if i > MAX_CHUNK_INDEX:
            break
# NOTE: the record still being accumulated when the loop ends is never
# parsed; only records followed by another "title" chunk reach `papers`.
# [1] The dataset is DBLP-Citation-network V13.