# 本文代码,不得转载。 (Code of this article; do not reproduce without permission.)
# -*- coding: utf-8 -*-
# Author: lx
# extract features from the text
import pandas as pd
import numpy as np
from text1 import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_array
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from data_process import load_data_and_labels
from nltk.corpus import stopwords
import nltk
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.stanford import StanfordDependencyParser
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
# Load the dataset (person-relation corpus).
trainFile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\graduation\person_relation.txt'
# e1, e2: token indices of the two entities in each sentence;
# pos1, pos2: relative positions centred on e1/e2 (entity position = 100).
texts, raw_label, e1, e2, pos1, pos2 = load_data_and_labels(trainFile)
# Tokenization
def token(texts):
    """Return a list of NLTK word-token lists, one per input sentence."""
    return [nltk.word_tokenize(sentence) for sentence in texts]
# Part-of-speech tagging, results collected in a list.
def pos(texts):
    """Return per-sentence lists of (token, POS-tag) pairs.

    Punctuation and stopword filtering were tried and deliberately left out
    in the original version, so the tag indices stay aligned with the raw
    tokenization produced by `token`.
    """
    return [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in texts]
# Named-entity recognition: one chunked Tree object per sentence.
def ner(texts):
    """NE-chunk every POS-tagged sentence (root tree kept, not filtered).

    Returns a list of nltk Tree objects, one per sentence in `texts`.
    """
    return [nltk.ne_chunk(tagged, binary=False) for tagged in pos(texts)]
# Constituency parsing: one parse result per sentence.
def parser(texts):
    """Parse each tokenized sentence with a CoreNLP server.

    NOTE(review): CoreNLPParser is constructed with an empty URL string —
    presumably a placeholder for the server address (nltk's default is
    http://localhost:9000); confirm before use. The StanfordParser route
    was abandoned ("standfordnlp有问题" in the original).
    """
    corenlp = CoreNLPParser(r'')
    results = []
    for sentence in texts:
        results.append(corenlp.parse(nltk.word_tokenize(sentence)))
    return results
# Build the lexical/positional feature matrix.
def featurevector(texts):
    """Build one 11-dimensional lexical/positional feature row per sentence.

    Columns (per the original design notes):
      x1/x2   POS tag of entity 1 / entity 2
      x3/x4   word immediately before e1 / e2
      x5/x6   POS tag of the word immediately before e1 / e2
      x7/x8   second word before e1 / e2
      x9/x10  POS tag of the second word before e1 / e2
      x11     token distance between the two entities (e2 - e1)

    Reads module-level `e1` / `e2` (entity token indices) loaded above.
    Returns a pandas DataFrame with columns "x1".."x11".
    """
    # PERF FIX: the original called pos(texts) / token(texts) INSIDE the loop
    # (several times per iteration), re-tokenizing and re-tagging the entire
    # corpus each time -- accidental O(n^2) passes of expensive NLP work.
    # Compute both once, up front; per-row results are unchanged.
    pos_list = pos(texts)
    tok_list = token(texts)
    rows = []
    for i in range(len(pos_list)):
        x1 = pos_list[i][e1[i]][1]
        x2 = pos_list[i][e2[i]][1]
        if e1[i] == 0:
            # No left context for e1: placeholder value.
            x3 = x5 = x7 = x9 = 'null'
        else:
            x3 = tok_list[i][e1[i] - 1]
            x5 = pos_list[i][e1[i] - 1][1]
            # NOTE(review): when e1[i] == 1, index e1[i]-2 == -1 wraps to the
            # LAST token; kept as in the original -- confirm this is intended.
            x7 = tok_list[i][e1[i] - 2]
            x9 = pos_list[i][e1[i] - 2][1]
        # NOTE(review): e2 context is not guarded for e2[i] < 2 (wraps around),
        # matching the original behavior.
        x4 = tok_list[i][e2[i] - 1]
        x6 = pos_list[i][e2[i] - 1][1]
        x8 = tok_list[i][e2[i] - 2]
        x10 = pos_list[i][e2[i] - 2][1]
        # f11, positional feature: distance between the two entities.
        x11 = int(e2[i] - e1[i])
        print(i)  # progress indicator (same output as the original `print i`)
        rows.append([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11])
    feature = pd.DataFrame(data=rows,
                           columns=["x1", "x2", "x3", "x4", "x5", "x6",
                                    "x7", "x8", "x9", "x10", "x11"])
    return feature
def tansx(feature):
    """Project the feature matrix onto its first two principal components."""
    return PCA(n_components=2).fit_transform(feature)
# x2 = ner(texts) # 2, named-entity features (computed by ner(), unused here)
# x3 = parser(texts) # 3, syntactic-parse features (unused here)
# SMOTE oversampling to balance the relation classes before the split.
smo = SMOTE(random_state=42)
# One-hot encode the categorical feature rows (sparse=False -> dense ndarray).
dict_vec = DictVectorizer(sparse=False)
# NOTE(review): orient='record' relies on old pandas abbreviation matching;
# modern pandas requires orient='records' -- confirm the installed version.
feature = dict_vec.fit_transform(featurevector(texts).to_dict(orient='record'))
print(dict_vec.feature_names_)
# NOTE(review): 400 hard-codes the corpus size -- breaks for other datasets.
# fit_sample is the pre-0.4 imbalanced-learn API (renamed fit_resample later).
X_smo, y_smo = smo.fit_sample(np.array(feature).reshape(400,-1), np.array(raw_label).reshape(400,-1))
# 80/20 train/test split on the resampled data.
x_train, x_test, y_train, y_test = train_test_split(X_smo, y_smo, train_size=0.8, random_state=33)
# Earlier experiments (no SMOTE; CountVectorizer / TF-IDF / DictVectorizer
# variants) kept below for reference:
# x_train, x_test, y_train, y_test = train_test_split(featurevector(texts), raw_label, train_size=0.8, random_state=33)
# wordVectorizer = CountVectorizer(min_df=3, token_pattern="\t", ngram_range=(1, 2))
# wordVectorizer = CountVectorizer(ngram_range=(1, 2))
# train_feature = wordVectorizer.fit_transform(x_train)
# train_feature = tansx(train_feature)
# # wordTransformer = TfidfTransformer()
# # train_feature = wordTransformer.fit_transform(train_feature)
# # test_feature = wordTransformer.transform(wordVectorizer.transform(x_test))
# test_feature = wordVectorizer.fit_transform(x_test)
# # onevsrest = OneVsRestClassifier(LogisticRegression(C = 1, tol = 0.01))
# dict_vec = DictVectorizer(sparse=False)
# X_train = dict_vec.fit_transform(x_train.to_dict(orient='record'))
# X_test = dict_vec.transform(x_test.to_dict(orient='record'))
#
# print(dict_vec.feature_names_)
train_feature = x_train
test_feature = x_test
# Train a one-vs-rest linear SVM on the (resampled) feature matrix.
onevsrest = OneVsRestClassifier(LinearSVC(C=1, tol=0.0001, dual=True))
onevsrest.fit(train_feature, y_train)
y_pred = onevsrest.predict(test_feature)
print("the mean accuracy:")
print(onevsrest.score(test_feature, y_test))
print("详细的评估指标:")
# BUG FIX: classification_report expects (y_true, y_pred); the arguments were
# swapped, which mislabels per-class precision/recall and support counts.
print(classification_report(y_test, y_pred))
# Top-N accuracy over decision-function margins: count a hit when the true
# label appears among the N highest-scoring classes (N = 1..10).
predictions = onevsrest.decision_function(test_feature)
predictions[predictions == 0] = - np.inf
# predictions = onevsrest.predict_proba(test_feature)
acc_list = [0.0] * 10
for prediction, label in zip(predictions, y_test):
    topN = np.argsort(prediction)[-10:]
    for i in range(1, 11):
        # NOTE(review): `label[topN[-i:]]` indexes each y_test row as a
        # multilabel indicator vector -- verify y_smo actually has that shape.
        if label[topN[-i:]].sum() > 0:
            acc_list[i - 1] += 1
for i, acc in enumerate(acc_list, 1):
    # Same "i acc" output as the original py2 `print i, acc`, but py3-safe.
    print("%s %s" % (i, acc / len(y_test)))
# if __name__ == "__main__":
# print pos(texts)
# print ner(texts)
# print ner(texts)[0].draw()
# for t in tree:
# print t
# print t.draw()
# text = "After repeating the 35-word oath of office, Trump stretched his arms wide and hugged his wife, Melania, and other members of his family"
# text = "This tradition has been narrated in prophet's early biographies"
# # 参数分别是jar包和model
# parser = StanfordParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
# # parse_result = parser.parse(nltk.word_tokenize(text))
# # for i in parse_result:
# # print i
# # for i in list(parse_result):
# # i.draw()
# lexparser = StanfordDependencyParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
# parse_result2 = lexparser.parse(nltk.word_tokenize(text))
# for i in list(parse_result2)[0].triples():
# print i
# from nltk.tag import StanfordPOSTagger
# nltk.internals.config_java(bin=r"D:/Java/bin/java")
# eng_tagger = StanfordPOSTagger(model_filename=r'D:/file_download\BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',path_to_jar=r'D:/file_download/BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
# print(eng_tagger.tag('What is the airspeed of an unladen swallow'.split()))
# print featurevector(texts)
# print token(texts)
# tex = "This tradition has been narrated in prophet's early biographies"
# tok = nltk.word_tokenize(tex)
# print nltk.pos_tag(tok)
# print nltk.ne_chunk(nltk.pos_tag(tok), binary=False)
# print nltk.ne_chunk(nltk.pos_tag(tok), binary=False).draw()