续前缘
本次任务
任务二:
|
Baseline精读
本次精读基于以下视频链接:
【AI夏令营】NLP赛题解析与Baseline逐行精读
此文将Baseline大致分为五部分来理解,首先是代码文件库的引用
# 导入pandas用于读取表格数据
import pandas as pd
# 导入TF-IDF(词频-逆文档频率)向量化器;原Baseline使用的是词袋模型(BOW)的CountVectorizer,这里已替换为TfidfVectorizer(替换时注意上下文要同时修改),亲测后者效果更佳
from sklearn.feature_extraction.text import TfidfVectorizer
# 导入LogisticRegression(逻辑回归)分类模型
from sklearn.linear_model import LogisticRegression
# 引入分词器
from nltk import word_tokenize, ngrams
# 过滤警告消息
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
读取数据集
# Load the training split (train.csv) and the test split (testB.csv).
train = pd.read_csv('/home/aistudio/data/data231041/train.csv')
test = pd.read_csv('/home/aistudio/data/data231041/testB.csv')

# Replace missing titles and abstracts with empty strings in both frames so
# that later string operations on these columns never see NaN.
for frame in (train, test):
    for column in ('title', 'abstract'):
        frame[column] = frame[column].fillna('')

# Build one free-text field per paper by joining the text columns with single
# spaces. The training text also includes 'Keywords'; the test text omits it.
train_text = train['title'].fillna('')
for column in ('author', 'abstract', 'Keywords'):
    train_text = train_text + ' ' + train[column].fillna('')
train['text'] = train_text

test_text = test['title'].fillna('')
for column in ('author', 'abstract'):
    test_text = test_text + ' ' + test[column].fillna('')
test['text'] = test_text

# Fit the TF-IDF vectorizer on the training text only, then transform both
# splits with that fitted vectorizer.
vector = TfidfVectorizer()
vector.fit(train['text'])
train_vector = vector.transform(train['text'])
test_vector = vector.transform(test['text'])
医学分类任务的训练
机器需要根据对论文摘要等信息的理解,将论文划分为医学领域的文献和非医学领域的文献两个类别之一。
# Binary classification: decide for each test paper whether it is a medical
# paper, using the TF-IDF features built from the training set.
labels = train['label']

# LogisticRegression has no batch_size or epoch settings; the knobs worth
# tuning here are the inverse regularisation strength `C` and `max_iter`.
model = LogisticRegression()
model.fit(train_vector, labels)

# Predict the label of every test paper and store it on the test frame.
predicted_labels = model.predict(test_vector)
test['label'] = predicted_labels
关键词提取任务
论文关键词划分为两类:
- 在标题和摘要中出现的关键词
- 没有在标题和摘要中出现的关键词
代码中所使用的停用词文件“stop.txt”可以在CSDN站内搜索并下载。
# 引入分词器
from nltk import word_tokenize, ngrams
# 定义停用词,去掉出现较多,但对文章不关键的词语
'''stops = [
'will', 'can', "couldn't", 'same', 'own', "needn't", 'between', "shan't", 'very',
'so', 'over', 'in', 'have', 'the', 's', 'didn', 'few', 'should', 'of', 'that',
'don', 'weren', 'into', "mustn't", 'other', 'from', "she's", 'hasn', "you're",
'ain', 'ours', 'them', 'he', 'hers', 'up', 'below', 'won', 'out', 'through',
'than', 'this', 'who', "you've", 'on', 'how', 'more', 'being', 'any', 'no',
'mightn', 'for', 'again', 'nor', 'there', 'him', 'was', 'y', 'too', 'now',
'whom', 'an', 've', 'or', 'itself', 'is', 'all', "hasn't", 'been', 'themselves',
'wouldn', 'its', 'had', "should've", 'it', "you'll", 'are', 'be', 'when', "hadn't",
"that'll", 'what', 'while', 'above', 'such', 'we', 't', 'my', 'd', 'i', 'me',
'at', 'after', 'am', 'against', 'further', 'just', 'isn', 'haven', 'down',
"isn't", "wouldn't", 'some', "didn't", 'ourselves', 'their', 'theirs', 'both',
're', 'her', 'ma', 'before', "don't", 'having', 'where', 'shouldn', 'under',
'if', 'as', 'myself', 'needn', 'these', 'you', 'with', 'yourself', 'those',
'each', 'herself', 'off', 'to', 'not', 'm', "it's", 'does', "weren't", "aren't",
'were', 'aren', 'by', 'doesn', 'himself', 'wasn', "you'd", 'once', 'because', 'yours',
'has', "mightn't", 'they', 'll', "haven't", 'but', 'couldn', 'a', 'do', 'hadn',
"doesn't", 'your', 'she', 'yourselves', 'o', 'our', 'here', 'and', 'his', 'most',
'about', 'shan', "wasn't", 'then', 'only', 'mustn', 'doing', 'during', 'why',
"won't", 'until', 'did', "shouldn't", 'which'
]'''
# Load the stop-word list from stop.txt, one entry per line with surrounding
# whitespace stripped. The context manager closes the file handle as soon as
# the list has been built instead of leaving it open.
with open(r'stop.txt', encoding='utf-8') as stop_file:
    stops = [line.strip() for line in stop_file]
# 定义方法按照词频筛选关键词
def extract_keywords_by_freq(title, abstract):
    """Return up to six repeated bigrams found in the title and abstract.

    Both texts are lower-cased and tokenised with ``word_tokenize``. Bigrams
    are formed within each text separately, so no bigram spans the boundary
    between title and abstract. A bigram is kept only when neither word is in
    the module-level ``stops`` list and both words are longer than three
    characters. The surviving bigrams are counted and only those that occur
    more than once are returned.

    Args:
        title: Title text of the paper.
        abstract: Abstract text of the paper.

    Returns:
        A list of at most six ``"word1 word2"`` phrases, in the order
        produced by ``value_counts``. The list is empty when no bigram
        qualifies, including when the texts yield no bigrams at all.
    """
    bigrams = list(ngrams(word_tokenize(title.lower()), 2))
    bigrams += list(ngrams(word_tokenize(abstract.lower()), 2))
    if not bigrams:
        # pd.DataFrame([]) has no columns, so selecting column 0 below would
        # raise KeyError for papers whose title and abstract are both empty.
        return []

    frame = pd.DataFrame(bigrams)
    first_word, second_word = frame[0], frame[1]
    keep = (
        ~first_word.isin(stops)
        & ~second_word.isin(stops)
        & (first_word.apply(len) > 3)
        & (second_word.apply(len) > 3)
    )

    # Build the phrase Series directly rather than assigning a new column on
    # a filtered slice, which triggers pandas' SettingWithCopyWarning.
    phrases = first_word[keep] + ' ' + second_word[keep]
    counts = phrases.value_counts()
    repeated = counts[counts > 1]
    return list(repeated.index)[:6]
## 对测试集提取关键词
# Collect one '; '-joined keyword string per test paper, in row order.
test_words = []
for paper_title, paper_abstract in zip(test['title'], test['abstract']):
    phrases = extract_keywords_by_freq(paper_title, paper_abstract)
    # str.title() only changes the casing of each extracted phrase; it does
    # not use the paper's title.
    phrases = [phrase.title() for phrase in phrases]
    # Fall back to placeholder keywords when nothing was extracted.
    if not phrases:
        phrases = ['A', 'B']
    test_words.append('; '.join(phrases))
csv的输出
# Attach the extracted keyword strings to the test frame, then write the
# three submission columns to submit_task1.csv.
test['Keywords'] = test_words
submission = test[['uuid', 'Keywords', 'label']]
submission.to_csv('submit_task1.csv', index=None)
Baseline优化
目前来看,对于可优化的方向自己并没有什么思路,做的只是把CountVectorizer替换为TfidfVectorizer来训练。得到的训练结果如图所示:
由于数据集有所变动,所以得到的分数可能不太理想。希望在接下来的学习中能继续有所收获,同时提高赛题的分数。