dblp数据集下载地址:https://dblp.org/xml/
拿到数据集后,首先要把它转换成csv格式的文件
xml转json格式:
import xml.sax
from xml.sax.handler import ContentHandler
from xml.sax import parse
import json
# Output handle for the intermediate JSON file. It is opened as soon as the
# module loads and is written to and closed by the script entry point below.
doc = open(file='out2.json', mode='w', encoding='utf-8')
# Serialized JSON text; stays empty until parsing has finished.
json_str = str()
# One list of author names per collected record; the SAX handler appends here.
all_json = list()
class article(xml.sax.ContentHandler):
    """SAX handler that collects the author list of each target DBLP record.

    A "target record" is an ``incollection``, ``article`` or
    ``mastersthesis`` element. When such a record closes, its (non-empty)
    list of author names is appended to the output list.

    Args:
        records: Optional list that receives one ``list[str]`` of author
            names per record. When omitted, the module-level ``all_json``
            list is used.
    """

    # Record elements whose author lists are collected.
    TARGET_TAGS = ('incollection', "article", 'mastersthesis')
    # Child elements whose text content is captured.
    TEXT_TAGS = ('author', 'title', 'journal')

    def __init__(self, records=None):
        super().__init__()
        self.CurrentData = ""   # name of the most recently opened element
        self.author = ""        # text of the last completed <author>
        self.title = ""         # text of the last completed <title>
        self.pages = ""         # never written by this handler
        self.journal = ""       # text of the last completed <journal>
        self.key = ""           # "key" attribute of the current/last record
        self.authorIndex = 0    # counter behind addAuthor()
        self.authors = []       # authors of the record being parsed
        self.authorNum = 0      # largest author count seen in any record
        # Only fall back to the global when no explicit sink was given.
        self.records = all_json if records is None else records
        self._in_record = False  # True while inside a target record
        self._field = None       # TEXT_TAGS element being captured, if any
        self._chunks = []        # text pieces received for self._field

    # Element start event.
    def startElement(self, tag, attributes):
        """Open a target record or start capturing a text field."""
        self.CurrentData = tag
        if tag in self.TARGET_TAGS:
            self._in_record = True
            # Start from a clean list so nothing leaks in from other records.
            self.authors = []
            self.key = attributes.get("key", "")
        elif self._in_record and self._field is None and tag in self.TEXT_TAGS:
            # Markup nested inside a field (e.g. <i> in a title) must not
            # restart the capture, hence the `_field is None` guard.
            self._field = tag
            self._chunks = []

    # Element end event.
    def endElement(self, tag):
        """Finish a captured text field or flush a completed record."""
        if self._field is not None and tag == self._field:
            text = "".join(self._chunks)
            self._field = None
            self._chunks = []
            if tag == "author":
                self.author = text
                self.authors.append(text)
            elif tag == "title":
                self.title = text
            else:
                self.journal = text
        elif self._in_record and tag in self.TARGET_TAGS:
            self._in_record = False
            if self.authors:
                self.authorNum = max(self.authorNum, len(self.authors))
                self.records.append(self.authors)
            # Progress output, emitted once per completed target record.
            print('end', len(self.authors), self.authorNum)
            # Rebind instead of clearing: the flushed list is now owned by
            # self.records.
            self.authors = []

    # Character data event.
    def characters(self, content):
        """Accumulate text for the field currently being captured.

        The parser may deliver one element's text in several calls, so the
        pieces are collected and joined when the element closes.
        """
        if self._field is not None:
            self._chunks.append(content)

    def addAuthor(self):
        """Increment the author counter and return the label ``'ra<N>'``."""
        self.authorIndex += 1
        return 'ra' + str(self.authorIndex)
if __name__ == "__main__":
    # Script entry point: parse dblp.xml with the SAX handler and dump the
    # collected author lists to the already-open JSON output file.
    handler = article()
    parser = xml.sax.make_parser()
    # Namespace processing is switched off.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    parser.setContentHandler(handler)
    try:
        parser.parse('dblp.xml')
        # Flush any authors the handler is still holding when the document
        # ends, so the final record is not lost.
        if handler.authors:
            all_json.append(list(handler.authors))
        json_str = json.dumps(all_json)
        print(json_str, file=doc)
    finally:
        # Always release the output file, even if parsing or dumping failed.
        doc.close()
然后json转csv
import json
# Characters removed from every name: double and single quotes, spaces, and
# commas (a comma inside a name would add an extra CSV column).
_NAME_STRIP = str.maketrans('', '', '"\' ,')


def authors_to_row(authors, width=10, filler='?'):
    """Build one fixed-width CSV line from a list of author names.

    Args:
        authors: Author names of one record.
        width: Number of columns every row must have.
        filler: Placeholder used for the missing columns of short records.

    Returns:
        A comma-separated string with exactly ``width`` fields: the first
        ``width`` names (quotes, spaces and commas removed), padded with
        ``filler``.
    """
    cells = [str(name).translate(_NAME_STRIP) for name in authors[:width]]
    cells.extend([filler] * (width - len(cells)))
    return ','.join(cells)


if __name__ == "__main__":
    # Convert the JSON author lists into a CSV file with ten columns per row.
    with open('out2.json', 'r', encoding='utf-8') as json_file, \
            open('out.csv', 'w', encoding='utf-8') as doc_file:
        print('read')
        json_str = json_file.read()
        print('load')
        ls = json.loads(json_str)
        print('write')
        for authors in ls:
            print(authors_to_row(authors), file=doc_file)
写的时候注意两点
1.每行的元素数量要一样,元素数量不一样的话weka会报错;我只取前十个作者名字,不足十个的用?补齐
2.所有的单双引号要去掉,不然可能会报EOL错误