Person Relation Extraction Based on Feature Engineering


The code in this post is my own; please do not repost it.

# -*- coding: utf-8 -*-
# Author: lx
# extract features from the text

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_array
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from data_process import load_data_and_labels
from nltk.corpus import stopwords
import nltk
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

# Load the data
trainFile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\graduation\person_relation.txt'
# e1, e2: position indices of the two entities; pos1, pos2: relative positions centered on e1/e2 (offset 100)
texts, raw_label, e1, e2, pos1, pos2 = load_data_and_labels(trainFile)
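# load_data_and_labels lives in the local data_process module, which is not shown in this
# post. A minimal sketch of what the code below assumes it returns (hypothetical, inferred
# from how the values are used later):
#   texts      - list of raw English sentences
#   raw_label  - one relation label per sentence (apparently a binarized label vector per
#                sentence, judging from how it is indexed in the top-N evaluation)
#   e1, e2     - token index of the first / second entity in each sentence
#   pos1, pos2 - per-token positions relative to e1 / e2, with the entity mapped to 100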

# Tokenization
def token(texts):
    tokens = []
    for text_raw in texts:
        tokens.append(nltk.word_tokenize(text_raw))
    return tokens
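# Illustrative example (not from the original post):
# token(["Trump hugged his wife ."]) would return
# [['Trump', 'hugged', 'his', 'wife', '.']] -- one token list per sentence.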

# POS tagging; results are collected in a list first
def pos(texts):
    rfiltered_list = []
    for text_raw in texts:
        text = nltk.word_tokenize(text_raw)
        # Strip punctuation?
        # english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
        # text = [w for w in text if w not in english_punctuations]
        # Remove stopwords or not? Left in for now.
        # filtered = [w for w in text if w not in stopwords.words('english')]
        rfiltered = nltk.pos_tag(text)
        rfiltered_list.append(rfiltered)
    return rfiltered_list
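# Illustrative example (not from the original post): pos(["Trump hugged his wife"])
# returns something like [[('Trump', 'NNP'), ('hugged', 'VBD'), ('his', 'PRP$'), ('wife', 'NN')]]
# -- one list of (word, Penn Treebank tag) pairs per sentence.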

# Named entity recognition; the result is a list of the chunked Tree objects for all sentences
def ner(texts):
    # ner_list = []
    # for text in pos(texts):
    #     tree_list = []
    #     for tree in nltk.ne_chunk(text, binary=False).subtrees():
    #         # filter out the root tree
    #         if tree.label() == "S":
    #             continue
    #         tree_list.append(tree)
    #     ner_list.append(tree_list)
    # Keep the root tree: one full Tree object per sentence
    ner_list = []
    for text in pos(texts):
        ner_list.append(nltk.ne_chunk(text, binary=False))
    return ner_list
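# Illustrative example (not from the original post): for "Trump hugged Melania",
# nltk.ne_chunk over the POS tags yields a Tree roughly like
#   (S (PERSON Trump/NNP) hugged/VBD (PERSON Melania/NNP))
# with labelled subtrees marking the recognized entities.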

# Constituency parsing (plus dependency parsing); the result is a list of parse objects, one per sentence
def parser(texts):
    # The StanfordParser NLTK wrapper was problematic, so CoreNLPParser is used instead
    parser = CoreNLPParser(r'')
    parser_list = []
    for text in texts:
        parse_result = parser.parse(nltk.word_tokenize(text))
        # print(parse_result)
        parser_list.append(parse_result)
    return parser_list
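# CoreNLPParser is a thin client for a running Stanford CoreNLP server; the URL is left
# blank above. A typical setup (assuming the default address, which is not stated in the post):
#   parser = CoreNLPParser(url='http://localhost:9000')
# after starting the server from the CoreNLP distribution directory with, e.g.:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000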

# Build the feature vectors
def featurevector(texts):
    # feature rows
    x = []
    # Lexical features:
    # f1: POS of e1; f2: POS of e2; f3: word before e1; f4: word before e2;
    # f5: POS of the word before e1; f6: POS of the word before e2;
    # f7: second word before e1; f8: second word before e2;
    # f9: POS of the second word before e1; f10: POS of the second word before e2
    pos_tags = pos(texts)   # tag each sentence once instead of re-tagging inside the loop
    tokens = token(texts)
    for i in range(0, len(pos_tags)):
        x1 = pos_tags[i][e1[i]][1]
        x2 = pos_tags[i][e2[i]][1]
        if e1[i] == 0:
            x3 = 'null'
            x5 = 'null'
            x7 = 'null'
            x9 = 'null'
        else:
            x3 = tokens[i][e1[i] - 1]
            x5 = pos_tags[i][e1[i] - 1][1]
            x7 = tokens[i][e1[i] - 2]
            x9 = pos_tags[i][e1[i] - 2][1]
        x4 = tokens[i][e2[i] - 1]
        x6 = pos_tags[i][e2[i] - 1][1]
        x8 = tokens[i][e2[i] - 2]
        x10 = pos_tags[i][e2[i] - 2][1]
        # f11, position feature: here simply the token distance between the two entities
        x11 = int(e2[i] - e1[i])
        # print([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11])
        print(i)
        x.append([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11])
    feature = pd.DataFrame(data=x, columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11"])
    return feature
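# Illustrative row (hypothetical values, based on the Trump/Melania example sentence):
# for e1 = "Trump" and e2 = "Melania", one row could look like
#   ['NNP', 'NNP', ',', ',', ',', ',', 'office', 'wife', 'NN', 'NN', 10]
# i.e. POS of both entities, the preceding word and its tag and the second preceding word
# and its tag (for e1 and e2 respectively), and the token distance between the entities.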

def tansx(feature):
    estimator = PCA(n_components=2)
    x_pca = estimator.fit_transform(feature)
    return x_pca
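# tansx is an optional dimensionality-reduction step: PCA(n_components=2) projects the
# one-hot feature matrix onto its two leading principal components, so an (n, d) input
# comes back as an (n, 2) array. Illustrative use (not wired into the active pipeline):
#   reduced = tansx(feature)   # feature is the DictVectorizer output defined below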


# Features considered but left commented out:
# x2 = ner(texts)     # 2, named entities
# x3 = parser(texts)  # 3, constituency / dependency parse

smo = SMOTE(random_state=42)
dict_vec = DictVectorizer(sparse=False)
# One-hot encode the categorical features; orient='records' gives one dict per row
feature = dict_vec.fit_transform(featurevector(texts).to_dict(orient='records'))

print(dict_vec.feature_names_)
# Balance the classes with SMOTE (fit_resample is called fit_sample in older imblearn releases)
X_smo, y_smo = smo.fit_resample(np.array(feature).reshape(len(texts), -1), np.array(raw_label).reshape(len(texts), -1))
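# SMOTE balances the relation classes by synthesizing new minority-class rows, interpolating
# between a sample and its nearest same-class neighbours, so every class has the same count
# before the train/test split. Illustrative sanity check (hypothetical):
#   print(X_smo.shape, y_smo.shape)   # more rows than the original sentences if classes were imbalanced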

x_train, x_test, y_train, y_test = train_test_split(X_smo, y_smo, train_size=0.8, random_state=33)
# x_train, x_test, y_train, y_test = train_test_split(featurevector(texts), raw_label, train_size=0.8, random_state=33)
# wordVectorizer = CountVectorizer(min_df=3, token_pattern="\t", ngram_range=(1, 2))
# wordVectorizer = CountVectorizer(ngram_range=(1, 2))
# train_feature = wordVectorizer.fit_transform(x_train)
# train_feature = tansx(train_feature)
# # wordTransformer = TfidfTransformer()
# # train_feature = wordTransformer.fit_transform(train_feature)
# # test_feature = wordTransformer.transform(wordVectorizer.transform(x_test))
# test_feature = wordVectorizer.fit_transform(x_test)
# # onevsrest = OneVsRestClassifier(LogisticRegression(C = 1, tol = 0.01))
# dict_vec = DictVectorizer(sparse=False)
# X_train = dict_vec.fit_transform(x_train.to_dict(orient='record'))
# X_test = dict_vec.transform(x_test.to_dict(orient='record'))
#
# print(dict_vec.feature_names_)

train_feature = x_train
test_feature = x_test
onevsrest = OneVsRestClassifier(LinearSVC(C=1, tol=0.0001, dual=True))
onevsrest.fit(train_feature, y_train)
y_pred = onevsrest.predict(test_feature)
print("Mean accuracy:")
print(onevsrest.score(test_feature, y_test))

print("Detailed evaluation metrics:")
# classification_report expects (y_true, y_pred) in that order
print(classification_report(y_test, y_pred))
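# classification_report prints per-class precision, recall, F1 and support, roughly in this
# layout (the numbers here are placeholders, not results from the post):
#               precision    recall  f1-score   support
#      class_0       0.80      0.75      0.77        20
#      class_1       0.62      0.70      0.66        10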



predictions = onevsrest.decision_function(test_feature)
predictions[predictions == 0] = - np.inf
# predictions = onevsrest.predict_proba(test_feature)

acc_list = [0.0] * 10
for prediction, label in zip(predictions, y_test):
    topN = np.argsort(prediction)[-10:]

    for i in range(1, 11):
        if label[topN[-i:]].sum() > 0:
            acc_list[i - 1] += 1

for i, acc in enumerate(acc_list, 1):
    print(i, acc / len(y_test))
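# The loop above measures top-N accuracy for N = 1..10: each test sample's decision scores
# are sorted, the 10 highest-scoring classes are kept, and the sample counts as correct at
# rank N if any of its true labels falls in the top N. Tiny worked example (hypothetical):
#   scores [0.1, 0.9, 0.3] with true labels [1, 0, 0] -> np.argsort gives [0, 2, 1];
#   topN[-1:] = [1] and topN[-2:] = [2, 1] both miss, topN[-3:] = [0, 2, 1] hits at N = 3.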


# if __name__ == "__main__":

    # print pos(texts)
    # print ner(texts)
    # print ner(texts)[0].draw()
    # for t in tree:
    #     print t
    #     print t.draw()
    # text = "After repeating the 35-word oath of office, Trump stretched his arms wide and hugged his wife, Melania, and other members of his family"
    # text = "This tradition has been narrated in prophet's early biographies"
    # # the two arguments are the parser jar and the models jar
    # parser = StanfordParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
    # # parse_result = parser.parse(nltk.word_tokenize(text))
    # # for i in parse_result:
    # #     print i
    # # for i in list(parse_result):
    # #     i.draw()
    # lexparser = StanfordDependencyParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
    # parse_result2 = lexparser.parse(nltk.word_tokenize(text))
    # for i in list(parse_result2)[0].triples():
    #     print i

    # from nltk.tag import StanfordPOSTagger
    # nltk.internals.config_java(bin=r"D:/Java/bin/java")
    # eng_tagger = StanfordPOSTagger(model_filename=r'D:/file_download\BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',path_to_jar=r'D:/file_download/BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
    # print(eng_tagger.tag('What is the airspeed of an unladen swallow'.split()))
    # print featurevector(texts)
    # print token(texts)
    # tex = "This tradition has been narrated in prophet's early biographies"
    # tok = nltk.word_tokenize(tex)
    # print nltk.pos_tag(tok)
    # print nltk.ne_chunk(nltk.pos_tag(tok), binary=False)
    # print nltk.ne_chunk(nltk.pos_tag(tok), binary=False).draw()

 
