Python代码用多种方式实现识别文本风格和作者数量(SVM,LogisticRegression)
支持向量机(SVM)是一种流行的机器学习算法,广泛用于分类和回归任务。近年来,支持向量机在作者归属和文本分类领域受到了极大的关注。
Efstathios Stamatatos(2009)提出了一种基于SVM的作者归因方法,并在几个数据集上展示了其有效性。作者提出了一种特征选择技术来降低数据的维数,提高SVM的分类精度。结果表明,SVM优于其他机器学习算法,如k-NN和决策树,表明其适用于作者归因任务。
逻辑回归(Logistic Regression)是一种流行的机器学习算法,用于对任务进行分类,包括作者归属和作者变化检测。在作者归因的背景下,逻辑回归通常会创建一个模型,学习区分不同作者的写作风格。该模型可用于判断未知文档中的作者,或根据写作风格对文本片段进行归类。该模型可以在数据集上进行训练,该数据集包含具有已知作者身份变化的文本片段的示例,然后用于预测新文本片段中作者身份发生变化的可能性。
几项研究已经证明了逻辑回归对作者归属和作者变更检测的有效性。例如,Stamatatos(2013)对使用逻辑回归进行作者归因的几项研究进行了综述,而Potthast等人(2011)使用逻辑回归来检测维基百科文章数据集中的作者变化。Woolf(2016)提供了一个关于如何在scikit学习中使用逻辑回归进行作者归因的分步指南,而De Coi等人(2020)提出了一种基于逻辑回归的作者归因方法,该方法结合了词汇、句法和风格特征。
后面有实现这两种方法的完整Python代码。
运行过程:
所有程序也由Python使用Spyder运行。
首先,打开generate_text_features.py,直接运行生成一个名为features的文件夹。
这将使用来自列车文件夹和验证文件夹的数据
其次,打开task1.py并直接运行以生成一个名为saved_models的文件夹。
然后,使用task2.py和task3.py执行相同的操作
(处理顺序必须遵循task1、task2和task3)
第三,打开main.py,并更改存储LogisticRegression.pickle的三条路径。
类似地,如果想要获得svc结果,那么将LogisticRegression.pickle更改为svc.pickle,那么它就可以工作了。
但是由于svc不包含段落作者信息,
所以提交与TASK3_MODEL相关的部分,然后它就可以运行
第四,使用 -i 和 -o 参数来运行它,格式类似于:
runfile('D:/Final/Others/main.py', args='-i test -o result1', wdir='D:/Final/Others')
注意:运行时记得修改相关路径
generate_text_features:
import json
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
import numpy as np
import pickle
import textstat
import time
from tqdm import tqdm
import os
def count_occurence(check_word_list, word_list_all):
    """Sum the frequency counts of the given words.

    check_word_list: iterable of words to look up.
    word_list_all: dict mapping word -> occurrence count.
    Returns the total count of all listed words present in the dict.
    """
    return sum(word_list_all[word] for word in check_word_list if word in word_list_all)
def count_occurence_phrase(phrase_list, para):
    """Sum the non-overlapping occurrences of each phrase in the paragraph text.

    phrase_list: iterable of phrase strings.
    para: paragraph text to search in.
    """
    return sum(para.count(phrase) for phrase in phrase_list)
def extract_features(document):
    """Extract a stylometric feature vector for each paragraph of a document.

    document: iterable of paragraph strings.
    Returns an (n_paragraphs, n_features) numpy array. Per paragraph the
    feature vector concatenates, in order: basic counts (sentences, vocab
    size, sentence-length bins, POS/word-shape counts), function-word and
    function-phrase counts, British/American difference-word counts,
    special-character counts, and textstat readability scores.
    """
    # Load the lexicons ONCE per call. The original re-opened and re-parsed
    # both JSON files inside the paragraph loop, doing redundant disk I/O
    # for every paragraph.
    with open('_function_words.json', 'r') as f:
        function_words = json.load(f)
    with open('_difference_words.json', 'r') as f:
        difference_dict = json.load(f)

    feature_all = []
    for para in document:
        sent_list = sent_tokenize(para)
        word_dict = {}
        sent_length_list = [0, 0, 0, 0, 0, 0]  # 0-10,10-20,20-30,30-40,40-50,>50
        pos_tag_list = [0] * 15
        for sent in sent_list:
            w_list = word_tokenize(sent)
            for (word, tag) in pos_tag(w_list):
                # NOTE: slot 0 (PRP) intentionally overlaps slot 4 (all
                # pronoun tags) — preserved from the original feature design.
                if tag in ['PRP']:
                    pos_tag_list[0] += 1
                if tag.startswith('J'):
                    pos_tag_list[1] += 1
                if tag.startswith('N'):
                    pos_tag_list[2] += 1
                if tag.startswith('V'):
                    pos_tag_list[3] += 1
                if tag in ['PRP', 'PRP$', 'WP', 'WP$']:
                    pos_tag_list[4] += 1
                elif tag in ['IN']:
                    pos_tag_list[5] += 1
                elif tag in ['CC']:
                    pos_tag_list[6] += 1
                elif tag in ['RB', 'RBR', 'RBS']:
                    pos_tag_list[7] += 1
                elif tag in ['DT', 'PDT', 'WDT']:
                    pos_tag_list[8] += 1
                elif tag in ['UH']:
                    pos_tag_list[9] += 1
                elif tag in ['MD']:
                    pos_tag_list[10] += 1
                # Word-shape features: long words, short words, casing.
                if len(word) >= 8:
                    pos_tag_list[11] += 1
                elif len(word) in [2, 3, 4]:
                    pos_tag_list[12] += 1
                if word.isupper():
                    pos_tag_list[13] += 1
                elif word[0].isupper():
                    pos_tag_list[14] += 1
            # Bucket sentence length into 10-word bins, with a >=50 overflow bin.
            num_words_sent = len(w_list)
            if num_words_sent >= 50:
                sent_length_list[-1] += 1
            else:
                sent_length_list[int(num_words_sent / 10)] += 1
            for w in w_list:
                # Collapse very long tokens (e.g. URLs) into one sentinel entry.
                if len(w) > 20:
                    w = '<Long_word>'
                word_dict.setdefault(w, 0)
                word_dict[w] += 1
        base_feat1 = [len(sent_list), len(word_dict)] + sent_length_list + pos_tag_list  # num_sentences, num_words
        special_char = [';', ':', '(', '/', '&', ')', '\\', '\'', '"', '%', '?', '!', '.', '*', '@']
        char_feat = [para.count(char) for char in special_char]
        function_words_feature = []
        for w in function_words['words']:
            if w in word_dict:
                function_words_feature.append(word_dict[w])
            else:
                function_words_feature.append(0)
        function_phrase_feature = [para.count(p) for p in function_words['phrases']]
        difference_words_feat = [count_occurence(difference_dict['word']['number'][0], word_dict),
                                 count_occurence(difference_dict['word']['number'][1], word_dict),
                                 count_occurence(difference_dict['word']['spelling'][0], word_dict),
                                 count_occurence(difference_dict['word']['spelling'][1], word_dict),
                                 count_occurence_phrase(difference_dict['phrase'][0], para),
                                 count_occurence_phrase(difference_dict['phrase'][1], para)]
        textstat_feat = [textstat.flesch_reading_ease(para),
                         textstat.smog_index(para),
                         textstat.flesch_kincaid_grade(para),
                         textstat.coleman_liau_index(para),
                         textstat.automated_readability_index(para),
                         textstat.dale_chall_readability_score(para),
                         textstat.difficult_words(para),
                         textstat.linsear_write_formula(para),
                         textstat.gunning_fog(para)]
        feature = base_feat1 + function_words_feature + function_phrase_feature + difference_words_feat + char_feat + textstat_feat
        feature_all.append(feature)
    return np.asarray(feature_all)
def generate_features(documents):
    """Compute text features for a collection of documents.

    documents: iterable of documents, each an iterable of paragraph strings.
    Returns a pair: (array of per-document feature sums,
    object-dtype array of per-paragraph feature matrices).
    """
    doc_level = []
    par_level = []
    with tqdm(documents, unit="document", desc=f"Generating features") as progress:
        for document in progress:
            paragraph_features = extract_features(document)
            # Document-level features are the element-wise sum over paragraphs.
            doc_level.append(sum(paragraph_features))
            par_level.append(paragraph_features)
    return np.array(doc_level), np.array(par_level, dtype=object)
def main():
    """Generate text features for the train/validation splits and pickle them."""
    from utilities import load_documents
    # Load documents
    train_docs, train_doc_ids = load_documents('train')
    val_docs, val_doc_ids = load_documents('validation')
    # NB! Generating features takes a long time
    train_doc_textf, train_par_textf = generate_features(train_docs)
    val_doc_textf, val_par_textf = generate_features(val_docs)
    # timestring = time.strftime("%Y%m%d-%H%M")
    os.makedirs('./features', exist_ok=True)
    outputs = {
        '_doc_textf_train.pickle': train_doc_textf,
        '_par_textf_train.pickle': train_par_textf,
        '_doc_textf_val.pickle': val_doc_textf,
        '_par_textf_val.pickle': val_par_textf,
    }
    for file_name, data in outputs.items():
        with open('./features/' + file_name, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()
task1:
import numpy as np
import pickle
import os
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from utilities import task1_load_cases, task2_load_cases, task3_load_cases
def task1_test():
    """Train task-1 (multi-author detection) classifiers on text features.

    Fits each classifier, computes its macro-F1 on the validation split,
    pickles the fitted model to ./saved_models/, and prints a score summary.
    """
    x_train, y_train, x_val, y_val = task1_load_cases(feature="textf", shuffle=True)
    classifiers = [
        SVC(),
        LogisticRegression(max_iter=1000),
    ]
    names = []
    scores = []
    # Create the output directory once, not on every loop iteration.
    if not os.path.exists('./saved_models'):
        os.makedirs('./saved_models')
    for i, clf in enumerate(classifiers):
        # Derive a short class name, e.g. "<class 'sklearn.svm._classes.SVC'>" -> "SVC".
        names.append(str(type(clf)).split(".")[-1][:-2])
        print(f'Fitting {i + 1}/{len(classifiers)}: {names[i]}')
        clf.fit(x_train, y_train)
        preds = clf.predict(x_val)
        scores.append(f1_score(y_val, preds, average='macro'))
        with open(f'./saved_models/task1_{names[i]}.pickle', 'wb') as handle:
            pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Report the validation scores; the original collected them but never
    # printed or otherwise used them.
    for name, score in zip(names, scores):
        print(f'{name}: macro-F1 {score:0.4f}')


if __name__ == '__main__':
    task1_test()
    # task1_lgbm()
    # task1_stacking_ensemble()
task2:
import numpy as np
import pickle
import os
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
import lightgbm as lgb
from lightgbm import LGBMClassifier
from stacking_ensemble import SklearnWrapper, LightGBMWrapper, StackingEnsemble
from utilities import lgbm_macro_f1, task2_load_cases
# lgb_params_emb = {
# 'seed': 0,
# 'objective': 'binary',
# 'verbose': -1,
# 'lambda_l1': 0.0,
# 'lambda_l2': 0.0,
# 'num_leaves': 31,
# 'feature_fraction': 0.716,
# 'bagging_fraction': 1.0,
# 'bagging_freq': 0,
# 'min_child_samples': 20,
# 'is_unbalance': 'true'}
# lgb_params_textf = {
# 'seed': 0,
# 'objective': 'binary',
# 'verbose': -1,
# 'lambda_l1': 6.700219383691919,
# 'lambda_l2': 1.001343889145046e-08,
# 'num_leaves': 155,
# 'feature_fraction': 0.748,
# 'bagging_fraction': 0.9714173348977203,
# 'bagging_freq': 2,
# 'min_child_samples': 20,
# 'is_unbalance': 'true'}
# def task2_lgbm():
# x_train, y_train, x_val, y_val = task2_load_cases(feature="textf", shuffle=True)
# train_ds = lgb.Dataset(x_train, label=y_train)
# val_ds = lgb.Dataset(x_val, label=y_val)
# model = lgb.train(lgb_params_textf, train_ds, valid_sets=[train_ds, val_ds], feval=lgbm_macro_f1,
# num_boost_round=10000, early_stopping_rounds=250, verbose_eval=250)
# preds = np.round(model.predict(x_val))
# ac = accuracy_score(y_val, preds)
# f1 = f1_score(y_val, preds, average='macro')
# f1_micro = f1_score(y_val, preds, average='micro')
# print(f"Evaluation: accuracy {ac:0.4f}, macro-F1 {f1:0.4f}, F1-micro {f1_micro:0.4f}")
# if not os.path.exists('./saved_models'):
# os.makedirs('./saved_models')
# with open(f'./saved_models/task1_lgbm_{round(f1 * 100)}.pickle', 'wb') as handle:
# pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
# def task2_stacking_ensemble():
# x_train_emb, y_train, x_val_emb, y_val = task2_load_cases(feature="emb", shuffle=False)
# x_train_textf, _, x_val_textf, _ = task2_load_cases(feature="textf", shuffle=False)
# classifiers_emb = [
# LightGBMWrapper(clf=LGBMClassifier, params=lgb_params_emb),
# SklearnWrapper(clf=RandomForestClassifier()),
# SklearnWrapper(clf=MLPClassifier(max_iter=1000)),
# SklearnWrapper(clf=BernoulliNB())]
# classifiers_textf = [
# LightGBMWrapper(clf=LGBMClassifier, params=lgb_params_textf),
# SklearnWrapper(clf=RandomForestClassifier()),
# SklearnWrapper(clf=MLPClassifier(max_iter=1000)),
# SklearnWrapper(clf=KNeighborsClassifier())]
# ensemble = StackingEnsemble()
# # Training ensemble on embeddings
# ensemble.add_to_ensemble(classifiers_emb, x_train_emb, y_train, x_val_emb, y_val, feature_set_name="emb")
# # Training ensemble on text features
# ensemble.add_to_ensemble(classifiers_textf, x_train_textf, y_train, x_val_textf, y_val, feature_set_name="textf")
# ensemble.train_meta_learner()
# preds = ensemble.predict([x_val_emb, x_val_textf])
# ac = accuracy_score(y_val, preds)
# f1 = f1_score(y_val, preds, average='macro')
# f1_micro = f1_score(y_val, preds, average='micro')
# print(f"Evaluation: accuracy {ac:0.4f}, macro-F1 {f1:0.4f}, F1-micro {f1_micro:0.4f}")
# if not os.path.exists('./saved_models'):
# os.makedirs('./saved_models')
# with open(f'./saved_models/task1_ensemble_{round(f1 * 100)}.pickle', 'wb') as handle:
# pickle.dump(ensemble, handle, protocol=pickle.HIGHEST_PROTOCOL)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def task2_test():
    """Train task-2 (style-change detection) classifiers on text features.

    Fits each classifier, computes its macro-F1 on the validation split,
    pickles the fitted model to ./saved_models/, and prints a score summary.
    """
    x_train, y_train, x_val, y_val = task2_load_cases(feature="textf", shuffle=True)
    classifiers = [
        SVC(),
        #LogisticRegression(max_iter=1000),
    ]
    names = []
    scores = []
    # Create the output directory once, not on every loop iteration.
    if not os.path.exists('./saved_models'):
        os.makedirs('./saved_models')
    for i, clf in enumerate(classifiers):
        # Derive a short class name, e.g. "<class 'sklearn.svm._classes.SVC'>" -> "SVC".
        names.append(str(type(clf)).split(".")[-1][:-2])
        print(f'Fitting {i + 1}/{len(classifiers)}: {names[i]}')
        clf.fit(x_train, y_train)
        preds = clf.predict(x_val)
        scores.append(f1_score(y_val, preds, average='macro'))
        print(preds)
        with open(f'./saved_models/task2_{names[i]}.pickle', 'wb') as handle:
            pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Report the validation scores; the original collected them but never
    # printed or otherwise used them.
    for name, score in zip(names, scores):
        print(f'{name}: macro-F1 {score:0.4f}')


if __name__ == '__main__':
    task2_test()
    # task2_lgbm()
    # task2_stacking_ensemble()
task3:
import numpy as np
import pickle
import os
from sklearn.metrics import f1_score, accuracy_score
from utilities import lgbm_macro_f1, task3_load_cases
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def task3_test():
    """Train task-3 (paragraph authorship) classifiers on text features.

    Fits each classifier, prints the running macro-F1 scores on the
    validation split, and pickles each fitted model to ./saved_models/.
    """
    x_train, y_train, x_val, y_val = task3_load_cases(feature="textf", shuffle=True)
    classifiers = [
        #SVC(),
        LogisticRegression(max_iter=1000),
    ]
    names, scores = [], []
    for index, model in enumerate(classifiers):
        # Derive a short class name from the repr of the model's type.
        short_name = str(type(model)).split(".")[-1][:-2]
        names.append(short_name)
        print(f'Fitting {index + 1}/{len(classifiers)}: {short_name}')
        model.fit(x_train, y_train)
        val_preds = model.predict(x_val)
        scores.append(f1_score(y_val, val_preds, average='macro'))
        print(scores)
        if not os.path.exists('./saved_models'):
            os.makedirs('./saved_models')
        with open(f'./saved_models/task3_{short_name}.pickle', 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    task3_test()
main:
import json
import pickle
import argparse
import os
import sys
import numpy as np
import time
import glob
from generate_embeddings import generate_embeddings
from generate_text_features import generate_features
from utilities import load_documents, task2_parchange_predictions, task3_binary_predictions, task3_authorship_predictions
# Find all pickle files in the saved_models1 directory
pickle_files = glob.glob("pan21-style-change-detection-stacking-ensemble-master/saved_models1/*.pickle")
# Paths to the pickled per-task models (swap to svc.pickle for the SVC runs).
# NOTE: the originals read "D:Final/..." — without the slash after the drive
# letter that is a drive-relative path, not the intended "D:/Final/..." root
# shown in the run instructions.
TASK1_MODEL = "D:/Final/Others/saved_models/task1_LogisticRegression.pickle"
TASK2_MODEL = "D:/Final/Others/saved_models/task2_LogisticRegression.pickle"
TASK3_MODEL = "D:/Final/Others/saved_models/task3_LogisticRegression.pickle"
def typeconverter(obj):
    """`default` hook for json.dump: convert NumPy scalars/arrays to builtins.

    Raises TypeError for any other type, as the json `default` contract
    requires. The original fell through and returned None, which silently
    serialized every unsupported value as JSON null.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
def main(data_folder, output_folder):
    """Run the full PAN21 style-change pipeline: features, predictions, output.

    data_folder: directory holding the input problem documents.
    output_folder: directory where per-problem solution JSON files are written.
    Side effects: creates output_folder if missing, writes one
    solution-problem-<i>.json per document.
    """
    start_time = time.time()
    # Load documents
    docs, doc_ids = load_documents(data_folder)
    print(f"Loaded {len(docs)} documents ...")
    # Generate document and paragraph features
    #doc_emb, par_emb = generate_embeddings(docs)
    doc_textf, par_textf = generate_features(docs)
    # Task 1: multi-author classification per document.
    # Models are loaded with `with` blocks; the original used
    # pickle.load(open(...)) and never closed the file handles.
    print("Task 1 predictions ...")
    with open(TASK1_MODEL, "rb") as model_file:
        task1_ensemble = pickle.load(model_file)
    task1_preds_proba = task1_ensemble.predict(doc_textf)
    task1_preds = np.round(task1_preds_proba)
    print(task1_preds)
    del task1_ensemble, doc_textf
    # Task 2: per-paragraph author-change predictions.
    print("Task 2 predictions ...")
    with open(TASK2_MODEL, "rb") as model_file:
        task2_ensemble = pickle.load(model_file)
    task2_preds = task2_parchange_predictions(task2_ensemble, par_textf, par_textf)
    print(task2_preds)
    del task2_ensemble
    # Task 3: paragraph authorship assignments.
    print("Task 3 predictions ...")
    with open(TASK3_MODEL, "rb") as model_file:
        task3_ensemble = pickle.load(model_file)
    task3_binary_preds = task3_binary_predictions(task1_preds_proba, task3_ensemble, par_textf, par_textf)
    task3_preds = task3_authorship_predictions(task1_preds_proba, task3_binary_preds, par_textf, par_textf)
    print(task3_preds)
    del task1_preds_proba, task3_binary_preds, task3_ensemble
    # Save solutions
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for i in range(len(task1_preds)):
        solution = {
            'multi-author': task1_preds[i],
            'changes': task2_preds[i],
            'paragraph-authors': task3_preds[i]
        }
        file_name = r'solution-problem-' + str(i + 1) + '.json'
        with open(os.path.join(output_folder, file_name), 'w') as file_handle:
            json.dump(solution, file_handle, default=typeconverter)
    print(f"Run finished after {(time.time() - start_time) / 60:0.2f} minutes.")
if __name__ == '__main__':
    # Parse -i/-o from the command line and run the full pipeline.
    cli = argparse.ArgumentParser(description='PAN21 Style Change Detection software submission')
    cli.add_argument("-i", "--input_dir", help="path to the dir holding the data", required=True)
    cli.add_argument("-o", "--output_dir", help="path to the dir to write the results to", required=True)
    parsed = cli.parse_args()
    main(parsed.input_dir, parsed.output_dir)