【NLP】Named Entity Extraction

This is the data processing used for https://github.com/mandieq/eco-frame-public.
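
The code talks to a Stanford CoreNLP server over HTTP, so the server has to be running on port 9000 before anything below will work. As a quick sanity check, a single sentence can be tagged by hand; the sentence here is an invented example, and the exact labels depend on which CoreNLP models are installed:

    # Start the server first, from the CoreNLP distribution directory:
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
    # nltk.download('punkt') may also be needed once for the tokenizers.
    from nltk.parse import CoreNLPParser
    from nltk.tokenize import word_tokenize

    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    print(list(ner_tagger.tag(word_tokenize("Alice met Bob in London."))))
    # Roughly: [('Alice', 'PERSON'), ('met', 'O'), ('Bob', 'PERSON'),
    #           ('in', 'O'), ('London', 'LOCATION'), ('.', 'O')]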

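txt_related is the author's own helper module and its source isn't shown in the post; judging from how it is called, returnContent() returns one text string per chapter and returnChapter() returns the matching chapter titles. A minimal stand-in under those assumptions (the file name and the splitting logic are made up, not the original module):

    # txt_related.py -- hypothetical stand-in, not the author's actual module
    def returnContent():
        """Return a list with the full text of each chapter (assumed layout)."""
        with open('book.txt', encoding='utf-8') as f:  # path is an assumption
            raw = f.read()
        return raw.split('CHAPTER')[1:]                # naive chapter split

    def returnChapter():
        """Return chapter titles, parallel to returnContent()."""
        return ['Chapter %d' % (i + 1) for i in range(len(returnContent()))]
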
import csv
import txt_related
from collections import Counter
from nltk.parse import CoreNLPParser
from nltk.tokenize import sent_tokenize, word_tokenize

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
texts = txt_related.returnContent()  # one text string per chapter
overall = Counter()                  # book-wide character frequencies
eachChapter = []                     # per-chapter (name, count) lists
for text in texts:                   # split each chapter into sentences
    sents = sent_tokenize(text)
    characters = []
    for sent in sents:               # tokenize and NER-tag each sentence
        tags = ner_tagger.tag(word_tokenize(sent))
        characters += [tag[0] for tag in tags if tag[1] == "PERSON"]  # keep PERSON entities only
    overall.update(characters)       # fold this chapter into the book-wide counter
    eachChapter.append(Counter(characters).most_common())  # per-chapter counts

# Build the frequency table: every character, ordered by overall frequency.
title = txt_related.returnChapter()
tops = [name for name, _ in overall.most_common()]

data = []
for counts in eachChapter:           # per chapter: count for every character (0 if absent)
    chapter_counts = dict(counts)
    data.append([chapter_counts.get(name, 0) for name in tops])
datas = list(zip(*data))             # transpose so each row is one character across all chapters

# Write the table to CSV: one row per character, one column per chapter.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['CHAPTER'] + title)
    for name, row in zip(tops, datas):
        csv_writer.writerow([name] + list(row))
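
The one non-obvious step above is list(zip(*data)): unpacking the row list into zip pairs up the i-th element of every row, which transposes the matrix. A tiny illustration:

    data = [[1, 2, 3],   # chapter 1: counts for three characters
            [4, 5, 6]]   # chapter 2
    print(list(zip(*data)))
    # [(1, 4), (2, 5), (3, 6)]  -- one tuple per character, one entry per chapter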

Data processing for the heatmap (the same NER pass as above, but only the seven most frequent characters are kept, and each CSV row is a chapter rather than a character):

    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    texts = txt_related.returnContent()  # one text string per chapter
    overall = Counter()                  # book-wide character frequencies
    eachChapter = []                     # per-chapter (name, count) lists
    for text in texts:                   # split each chapter into sentences
        sents = sent_tokenize(text)
        characters = []
        for sent in sents:               # tokenize and NER-tag each sentence
            tags = ner_tagger.tag(word_tokenize(sent))
            characters += [tag[0] for tag in tags if tag[1] == "PERSON"]  # keep PERSON entities only
        overall.update(characters)       # fold this chapter into the book-wide counter
        eachChapter.append(Counter(characters).most_common())  # per-chapter counts

    # Write the heatmap table to CSV: one row per chapter, one column per
    # top-7 character. The header gets a leading 'CHAPTER' cell so it lines
    # up with the chapter title prepended to every data row.
    title = txt_related.returnChapter()
    tops = [name for name, _ in overall.most_common(7)]  # the 7 most frequent characters

    with open('datas.csv', 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['CHAPTER'] + tops)
        for chapter_title, counts in zip(title, eachChapter):
            chapter_counts = dict(counts)
            csv_writer.writerow([chapter_title] + [chapter_counts.get(name, 0) for name in tops])
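
For completeness, one way to actually render the heatmap from datas.csv. This is a sketch using pandas and matplotlib; the eco-frame-public repo may well plot the data differently:

    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.read_csv('datas.csv', index_col='CHAPTER')  # rows: chapters, columns: characters
    fig, ax = plt.subplots()
    im = ax.imshow(df.values, aspect='auto', cmap='Reds')
    ax.set_xticks(range(len(df.columns)))
    ax.set_xticklabels(df.columns, rotation=45, ha='right')
    ax.set_yticks(range(len(df.index)))
    ax.set_yticklabels(df.index)
    fig.colorbar(im, ax=ax, label='mentions')
    plt.tight_layout()
    plt.show()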
