# 本文代码,不得转载。 (Code from this article; do not repost.)
# -*- coding: utf-8 -*-
# Author: lx
# extract features from the text
import pandas as pd
import numpy as np
from text1 import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_array
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from data_process import load_data_and_labels
from nltk.corpus import stopwords
import nltk
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.stanford import StanfordDependencyParser
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
# Load the training data.
# NOTE(review): hard-coded absolute Windows path — consider making it configurable.
trainFile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\graduation\person_relation.txt'
# Per the original author's note: e1, e2 are position indices of the two
# entities; pos1, pos2 are token positions relative to e1/e2, centered at 100.
# TODO(review): confirm these semantics against load_data_and_labels in data_process.
texts, raw_label, e1, e2, pos1, pos2 = load_data_and_labels(trainFile)
# Tokenization
def token(texts):
    """Tokenize each raw text into a list of word tokens.

    Args:
        texts: iterable of raw text strings.

    Returns:
        list of token lists, one per input text (empty list for empty input).
    """
    # Comprehension replaces the manual append loop; also avoids the original's
    # local variable `token` shadowing the function's own name.
    return [nltk.word_tokenize(text_raw) for text_raw in texts]
# 词性标注,先用list保存
def pos(texts):
rfiltered_list = []
for text_raw in texts:
text = nltk.word_tokenize(text_raw)
# 去掉标点符号
# english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '