This is the data processing for https://github.com/mandieq/eco-frame-public.
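Two things have to be in place before the scripts below will run. First, a Stanford CoreNLP server must be listening on port 9000, e.g. started from the CoreNLP directory with java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000. Second, txt_related is a local helper module whose implementation isn't shown; below is a minimal sketch of the two functions the scripts call, assuming one UTF-8 .txt file per chapter in a chapters/ directory (the directory name and file layout are my assumptions):

# Hypothetical sketch of the txt_related helper used below; only the two
# functions the scripts call are defined, and the file layout is assumed.
import os

CHAPTER_DIR = 'chapters'  # assumed: one .txt file per chapter

def returnChapter():
    """Return the chapter titles, here simply the sorted file names."""
    return [os.path.splitext(name)[0] for name in sorted(os.listdir(CHAPTER_DIR))]

def returnContent():
    """Return each chapter's full text, in the same order as returnChapter()."""
    contents = []
    for name in sorted(os.listdir(CHAPTER_DIR)):
        with open(os.path.join(CHAPTER_DIR, name), encoding='utf-8') as fh:
            contents.append(fh.read())
    return contents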
import txt_related  # local helper that loads the book; see the sketch above
import csv
import nltk  # nltk.download('punkt') is needed once for the tokenizers below
from nltk.parse import CoreNLPParser
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')

texts = txt_related.returnContent()  # fetch the text of every chapter
all_counts = Counter()  # book-wide name frequencies ("all" would shadow the builtin)
eachChapter = []        # per-chapter (name, count) lists
for text in texts:      # each chapter has to be split into sentences first
    sents = sent_tokenize(text)
    characters = []
    for sent in sents:  # tokenize each sentence and run NER over it
        tags = ner_tagger.tag(word_tokenize(sent))
        character = [tag[0] for tag in tags if tag[1] == "PERSON"]  # keep PERSON tokens only
        characters = characters + character
    all_counts.update(characters)  # accumulate counts for the whole book
    li = Counter(characters)       # count each character's mentions within this chapter
    eachChapter.append(li.most_common())
f = open('data.csv', 'a+', encoding='utf-8', newline='')  # dump the character frequencies to CSV
csv_writer = csv.writer(f)
data = []
title = txt_related.returnChapter()
csv_writer.writerow(['CHAPTER'] + title)  # header row: a label cell plus one column per chapter
print(len(title))

tops = all_counts.most_common()  # every character, ranked by total mentions
tops = [top[0] for top in tops]  # keep just the names
for chapter in range(len(eachChapter)):  # per chapter: how often does each ranked name occur?
    names = [name[0] for name in eachChapter[chapter]]
    times = [time[1] for time in eachChapter[chapter]]
    calTime = []
    for i in range(len(tops)):
        if tops[i] in names:
            calTime.append(times[names.index(tops[i])])
        else:
            calTime.append(0)  # this character is never mentioned in the chapter
    data.append(calTime)
datas = list(zip(*data))  # transpose the matrix: one row per character, one column per chapter
print(datas)
count = 0
for row in datas:  # "row" instead of reusing the name "data"
    csv_writer.writerow([tops[count]] + list(row))  # character name, then its per-chapter counts
    count = count + 1
f.close()
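The zip(*data) transpose is the one non-obvious step, so here is a tiny self-contained illustration with made-up numbers: data holds one row per chapter, and transposing it yields one row per character, which is the orientation written to data.csv.

# zip(*rows) turns a list of rows into a list of column tuples:
data = [[3, 0],   # chapter 1: counts for two characters
        [1, 2],   # chapter 2
        [0, 5]]   # chapter 3
datas = list(zip(*data))
print(datas)  # [(3, 1, 0), (0, 2, 5)] -> one tuple per character, one entry per chapter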
Data processing for the heatmap:
The NER counting step is identical to the script above: it fills all_counts with the book-wide name frequencies and eachChapter with the per-chapter (name, count) lists. Only the CSV layout changes:
f = open('datas.csv', 'a+', encoding='utf-8', newline='')  # dump the heatmap data to CSV
csv_writer = csv.writer(f)
title = txt_related.returnChapter()
tops = all_counts.most_common(7)  # the 7 most frequently mentioned characters
tops = [top[0] for top in tops]   # keep just the names
csv_writer.writerow(['CHAPTER'] + tops)  # header: a label cell plus one column per character
count = 0
for chapter in eachChapter:  # per chapter: counts for each of the top-7 characters
    names = [name[0] for name in chapter]
    times = [time[1] for time in chapter]
    calTime = []
    for i in range(len(tops)):
        if tops[i] in names:
            calTime.append(times[names.index(tops[i])])
        else:
            calTime.append(0)  # this character is never mentioned in the chapter
    calTime.insert(0, title[count])  # prepend the chapter title as the row label
    count = count + 1
    csv_writer.writerow(calTime)
f.close()
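Once datas.csv exists it can be rendered directly. A minimal plotting sketch with pandas and matplotlib; the plotting side is my assumption, since the scripts above only produce the CSV:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('datas.csv', index_col=0)  # rows: chapters, columns: the top-7 characters
plt.figure(figsize=(6, 10))
plt.imshow(df.values, aspect='auto', cmap='Reds')  # one cell per (chapter, character) pair
plt.xticks(range(len(df.columns)), df.columns, rotation=45, ha='right')
plt.yticks(range(len(df.index)), df.index)
plt.colorbar(label='mentions per chapter')
plt.tight_layout()
plt.show()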