人物关系抽取——基于特征工程

本文代码,不得转载。

# -*- coding: utf-8 -*-
# Author: lx
# extract features from the text

# Third-party
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPParser
# FIX: CountVectorizer comes from sklearn.feature_extraction.text,
# not a local "text1" module (matches the TfidfTransformer import below).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_array
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Local
from data_process import load_data_and_labels

# Load the training data from disk.
trainFile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\graduation\person_relation.txt'
# e1, e2: entity position indices; pos1, pos2: relative positions with
# e1/e2 as the center (offset 100) — per the original author's note;
# exact semantics live in data_process.load_data_and_labels (not visible here).
texts, raw_label, e1, e2, pos1, pos2 = load_data_and_labels(trainFile)

# 分词
def token(texts):
    token = []
    for text_raw in texts:
        text = nltk.word_tokenize(text_raw)
        token.append(text)
    return token

# 词性标注,先用list保存
def pos(texts):
    rfiltered_list = []
    for text_raw in texts:
        text = nltk.word_tokenize(text_raw)
        # 去掉标点符号
        # english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值