weka发现dblp论文合著者关系-数据挖掘作业

最新推荐文章于 2024-03-20 20:57:43 发布

Firesuiry

最新推荐文章于 2024-03-20 20:57:43 发布

阅读量1.4k

点赞数 1

本文链接：https://blog.csdn.net/weixin_44387667/article/details/102717063

版权

dblp数据集下载地址：https://dblp.org/xml/

拿到数据集后，首先要把它转换成csv格式的文件

xml转json格式：

import xml.sax
from xml.sax.handler import ContentHandler
from xml.sax import parse
import json

# Collected records: one list of author names per parsed record.
all_json = list()
# Placeholder for the serialized JSON text; reassigned once parsing is done.
json_str = str()
# Output file for the JSON dump; opening in 'w' mode truncates any old file.
doc = open(file='out2.json', mode='w', encoding='utf-8')


class article(xml.sax.ContentHandler):
    """SAX content handler that collects the author list of each record.

    Only records whose element name is listed in ``TARGET_TAGS`` are
    collected. When such a record closes, its author names are appended as
    one list to ``self.records``; records without any author are skipped.

    Args:
        records: List that receives one list of author names per record.
            Defaults to the module-level ``all_json`` list.
    """

    # Record element names whose author lists are collected.
    TARGET_TAGS = ('incollection', 'article', 'mastersthesis')

    def __init__(self, records=None):
        super().__init__()
        # Resolve the default lazily so an explicit sink never needs all_json.
        self.records = all_json if records is None else records
        self.CurrentData = ""   # name of the element whose text is being read
        self.author = ""        # text of the author element being read
        self.title = ""
        self.pages = ""
        self.journal = ""
        self.authorIndex = 0    # counter behind addAuthor()
        self.authors = []       # authors of the record currently open
        self.authorNum = 0      # largest author count seen in one record
        self.key = None         # "key" attribute of the latest target record
        self._in_record = False  # True while inside a TARGET_TAGS element

    # Element start event
    def startElement(self, tag, attributes):
        """Open a new record or start buffering the text of a child element."""
        self.CurrentData = tag
        if tag in self.TARGET_TAGS:
            self._in_record = True
            self.authors = []
            # .get() tolerates a record without the attribute.
            self.key = attributes.get("key")
        elif tag == "author":
            self.author = ""
        elif tag == "title":
            self.title = ""
        elif tag == "journal":
            self.journal = ""

    # Element end event
    def endElement(self, tag):
        """Store a finished author, or emit the author list of a finished record."""
        # Dispatch on the closing tag itself: CurrentData only remembers the
        # most recently *opened* element, which is wrong for a parent's end.
        if tag == "author":
            # Authors of non-target records must not leak into the output.
            if self._in_record:
                self.authors.append(self.author)
                self.authorNum = max(self.authorNum, len(self.authors))
        elif tag in self.TARGET_TAGS:
            # Emit on close so the last record of the file is not lost.
            if self.authors:
                self.records.append(self.authors)
            self.authors = []
            self._in_record = False
        # Text between elements (indentation, newlines) belongs to no field.
        self.CurrentData = ""

    # Character data event
    def characters(self, content):
        """Accumulate text of the current element.

        The parser may deliver one element's text in several chunks, so the
        chunks are concatenated rather than overwritten.
        """
        if self.CurrentData == "author":
            self.author += content
        elif self.CurrentData == "title":
            # NOTE(review): text inside nested markup of a title is not
            # captured; the title is not used for the output.
            self.title += content
        elif self.CurrentData == "journal":
            self.journal += content

    def addAuthor(self):
        """Return the next sequential author id ('ra1', 'ra2', ...)."""
        self.authorIndex += 1
        return 'ra' + str(self.authorIndex)


# Parse dblp.xml and dump the collected author lists to the JSON output file.
# The try/finally guarantees the output file is closed even if parsing fails.
try:
    if __name__ == "__main__":
        handler = article()
        parser = xml.sax.make_parser()
        # Namespace processing is not needed; elements are matched by raw name.
        parser.setFeature(xml.sax.handler.feature_namespaces, 0)
        parser.setContentHandler(handler)
        parser.parse('dblp.xml')
        # Flush authors still buffered in the handler once parsing is over;
        # this is a no-op when the handler has already emitted every record.
        if handler.authors:
            all_json.append(handler.authors.copy())
            handler.authors = []
        json_str = json.dumps(all_json)
        print(json_str, file=doc)
finally:
    doc.close()

然后json转csv

import json

# Characters deleted from every author name: quotes (they make the CSV
# reader fail), spaces, and commas / line breaks (they would change the
# number of columns or rows).
_STRIP_TABLE = str.maketrans('', '', '"\' ,\r\n')


def format_row(authors, width=10, filler='?'):
    """Return one CSV line with exactly *width* author fields.

    Args:
        authors: Sequence of author names for one record.
        width: Number of columns every row must have.
        filler: Value used to pad rows that have fewer than *width* authors.

    Returns:
        The first *width* names, cleaned of quotes, spaces, commas and line
        breaks, joined by commas and padded with *filler* up to *width*.
    """
    # Keep the first `width` authors (not width - 1), then pad the remainder.
    fields = [str(author).translate(_STRIP_TABLE) for author in authors[:width]]
    fields.extend([filler] * (width - len(fields)))
    return ','.join(fields)


def main():
    """Convert out2.json (a list of author lists) into the fixed-width out.csv."""
    print('read')
    with open('out2.json', 'r', encoding='utf-8') as json_file:
        json_str = json_file.read()
    print('load')
    ls = json.loads(json_str)
    print('write')
    with open('out.csv', 'w', encoding='utf-8') as doc_file:
        for authors in ls:
            print(format_row(authors), file=doc_file)


if __name__ == "__main__":
    main()

写的时候注意两点

1.每行的元素数量要一样，数量不一致的话weka会报错；我取了前十个作者名字，不足十个的用?补齐

2.所有的单双引号要去掉，不然可能会报EOL错误