NLP News Text Classification: Task 3 Check-in

Task 2:

import pandas as pd

# Load the training set (tab-separated columns: label, text)
train_df = pd.read_csv('./train_set.csv', sep='\t')
train_df.head()
   label                                               text
0      2  2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1     11  4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2      3  7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3      2  7159 948 4866 2109 5520 2490 211 3956 5520 549...
4      3  3646 3055 3055 2490 4659 6065 3370 5814 2465 5...
train_df['text_len'] = train_df['text'].apply(lambda x: len(x.split(' ')))
train_df
        label                                               text  text_len
0           2  2967 6758 339 2021 1854 3731 4109 3792 4149 15...      1057
1          11  4464 486 6352 5619 2465 4802 1452 3137 5778 54...       486
2           3  7346 4068 5074 3747 5681 6093 1777 2226 7354 6...       764
3           2  7159 948 4866 2109 5520 2490 211 3956 5520 549...      1570
4           3  3646 3055 3055 2490 4659 6065 3370 5814 2465 5...       307
...       ...                                                ...       ...
199995      2  307 4894 7539 4853 5330 648 6038 4409 3764 603...       868
199996      2  3792 2983 355 1070 4464 5050 6298 3782 3130 68...      1142
199997     11  6811 1580 7539 1252 1899 5139 1386 3870 4124 1...      1180
199998      2  6405 3203 6644 983 794 1913 1678 5736 1397 191...       179
199999      3  4350 3878 3268 1699 6909 5505 2376 2465 6088 2...      2098

200000 rows × 3 columns

train_df['text_len'].describe()
count    200000.000000
mean        907.207110
std         996.029036
min           2.000000
25%         374.000000
50%         676.000000
75%        1131.000000
max       57921.000000
Name: text_len, dtype: float64
%matplotlib inline
import matplotlib.pyplot as plt

# Distribution of document lengths (in tokens)
plt.hist(train_df['text_len'], bins=200)
plt.xlabel('Text char count')
plt.title("Histogram of char count")
[Figure: histogram of char counts per document (output_5_1.png)]

# Class distribution of the news categories
train_df['label'].value_counts().plot(kind='bar')
plt.title('News class count')
plt.xlabel("category")
[Figure: bar chart of news class counts (output_6_1.png)]

from collections import Counter

# Corpus-wide token frequencies
all_lines = ' '.join(list(train_df['text']))
word_count = Counter(all_lines.split(" "))
word_count = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
print(len(word_count))
# 6869

print(word_count[0])
# ('3750', 7482224)

print(word_count[-1])
# ('3133', 1)
# Deduplicate tokens within each document, then recount: each token's
# count is now the number of documents it appears in.
train_df['text_unique'] = train_df['text'].apply(lambda x: ' '.join(list(set(x.split(' ')))))
all_lines = ' '.join(list(train_df['text_unique']))
word_count = Counter(all_lines.split(" "))
word_count = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
print(word_count[0])
# ('3750', 197997)

print(word_count[1])
# ('900', 197653)

print(word_count[2])
# ('648', 191975)
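After deduplication these counts are document frequencies: '3750' appears in roughly 99% of the 200,000 documents, '900' in about 99%, and '648' in about 96%, which is why the exercises below treat these three tokens as punctuation. A small sketch of the coverage ratios, reusing the word_count list from the cell above:

# Document-frequency ratios of the three most common tokens; near-total
# coverage is what suggests they function as punctuation.
n_docs = len(train_df)
for token, doc_freq in word_count[:3]:
    print(token, round(doc_freq / n_docs, 3))
# 3750 0.99, 900 0.988, 648 0.96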

Task 2 exercise answers:

# Q1: Estimate the average number of sentences per article, assuming the
# tokens '3750', '900' and '648' are punctuation marks that each end a sentence.
seq = []
for line in train_df['text']:
    num = 0
    for word in line.split(' '):
        if word in ['3750', '900', '648']:
            num += 1
    seq.append(num)
mean_seq = sum(seq) / len(seq)
mean_seq
78.34829

So under this assumption each article contains about 78 sentences on average.
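The same count can be written more compactly with pandas; a minimal sketch (the punct and sent_count names are mine, not from the original):

# Vectorized variant: count punctuation tokens per document, then average.
punct = {'3750', '900', '648'}
sent_count = train_df['text'].apply(lambda x: sum(w in punct for w in x.split(' ')))
print(sent_count.mean())  # ~78.35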
# Q2: Deduplicate the characters within each text, then find the most
# frequent character in each class.
train_df['text_unique'] = train_df['text'].apply(lambda x: ' '.join(list(set(x.split(' ')))))
train_df['label'].unique()  # all class labels
array([ 2, 11,  3,  9, 10, 12,  0,  7,  4,  1,  6,  5,  8, 13],
      dtype=int64)
sequence = []
for i in train_df['label'].unique():
    res = ' '.join(train_df[train_df['label'] == i]['text_unique'])  # all texts of one class, joined
    sequence.append(res)
# For each class, track the most frequent character, skipping the three
# punctuation tokens identified above.
m = 0
dic = {}
word = ''
r = []
for i in sequence:
    for j in i.split(' '):
        dic[j] = dic.setdefault(j, 0) + 1
        if (dic[j] > m) and (j not in ['3750', '900', '648']):
            word = j
            m = dic[j]
    r.append(word)
    m = 0
    dic = {}
    word = ''
r  # most frequent character per class
['7399',
 '6122',
 '2465',
 '7399',
 '885',
 '3370',
 '2465',
 '3370',
 '4853',
 '3370',
 '5620',
 '6122',
 '6122',
 '2662']
# Each class label and its most frequent character (punctuation excluded)
for i in range(len(r)):
    print(train_df['label'].unique()[i], r[i])
2 7399
11 6122
3 2465
9 7399
10 885
12 3370
0 2465
7 3370
4 4853
1 3370
6 5620
5 6122
8 6122
13 2662
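The max-tracking loop above can also be written with collections.Counter; a sketch that should reproduce the same mapping (ties may resolve differently):

from collections import Counter

# Most frequent non-punctuation character per class, via Counter
punct = {'3750', '900', '648'}
for lbl in train_df['label'].unique():
    joined = ' '.join(train_df[train_df['label'] == lbl]['text_unique'])
    counts = Counter(w for w in joined.split(' ') if w not in punct)
    print(lbl, counts.most_common(1)[0][0])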

Task 3:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

train_df = pd.read_csv('./train_set.csv', sep='\t', nrows=15000)

# Count Vectors + RidgeClassifier: train on the first 10,000 rows,
# validate on the remaining 5,000.
vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
0.741494277019762
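One caveat about this pipeline, not raised in the original: CountVectorizer's default token_pattern, r"(?u)\b\w\w+\b", only matches tokens of two or more word characters, so the single-digit word IDs in this corpus are silently dropped. A sketch of keeping them:

# Override the default token pattern so single-character tokens (word IDs
# 0-9) are also counted; TfidfVectorizer accepts the same argument.
vectorizer = CountVectorizer(max_features=3000, token_pattern=r"(?u)\b\w+\b")
train_test = vectorizer.fit_transform(train_df['text'])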
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

# TF-IDF + RidgeClassifier on the same train/validation split
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:10000], train_df['label'].values[:10000])

val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
0.8721598830546126
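Part of why TF-IDF beats raw counts here is that it down-weights tokens appearing in nearly every document, such as the punctuation tokens found in Task 2. With sklearn's defaults (smooth_idf=True), each term is weighted by tf * idf where idf = ln((1 + n) / (1 + df)) + 1; a quick illustration with made-up document frequencies:

import numpy as np

# sklearn's smoothed idf: ubiquitous tokens (df close to n) get a weight
# near 1, rare tokens get much larger weights. The df values are illustrative.
n = 15000
for df in (14900, 50):
    print(df, np.log((1 + n) / (1 + df)) + 1)
# 14900 -> ~1.007, 50 -> ~6.68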

Task 3 exercise answers:

1. Vary TfidfVectorizer's parameters to find the best-performing values (the scores are tabulated after the loop below).

# Grid over the n-gram upper bound (outer loop) and vocabulary size
# (inner loop); each iteration prints the validation macro-F1.
for i in range(1, 6):
    for j in [1000, 2000, 3000, 4000, 5000]:
        tfidf = TfidfVectorizer(ngram_range=(1, i), max_features=j)
        train_test = tfidf.fit_transform(train_df['text'])

        clf = RidgeClassifier()
        clf.fit(train_test[:10000], train_df['label'].values[:10000])

        val_pred = clf.predict(train_test[10000:])
        print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
Validation macro-F1 (only the runs with n-gram upper bound 1 to 3 were recorded):

max_features        1000    2000    3000    4000    5000
ngram_range (1,1)  0.8359  0.8611  0.8588  0.8602  0.8603
ngram_range (1,2)  0.8289  0.8585  0.8727  0.8794  0.8863
ngram_range (1,3)  0.8271  0.8604  0.8722  0.8753  0.8851

From the table, ngram_range=(1,2) with max_features=5000 gives the highest score (macro-F1 ≈ 0.8863) among the recorded runs.

2. Train with different models and compare the scores.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the number of trees for a random forest, using the best
# TF-IDF settings found above.
n_estimators = range(100, 1100, 100)
hyper = {'n_estimators': n_estimators}
gd = GridSearchCV(estimator=RandomForestClassifier(), param_grid=hyper, verbose=True)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
train_test = tfidf.fit_transform(train_df['text'])
gd.fit(train_test[:10000], train_df['label'].values[:10000])
print(gd.best_score_)
print(gd.best_estimator_)
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 167.9min finished


0.8684000000000001
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=900,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
# Refit with the best n_estimators and score on the hold-out split
model = RandomForestClassifier(n_estimators=900)
model.fit(train_test[:10000], train_df['label'].values[:10000])
pred = model.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], pred, average='macro'))
0.7979654404582549

The hold-out macro-F1 (0.798) is clearly below the grid-search score (0.868), which suggests overfitting, though note that GridSearchCV reports mean accuracy by default while the hold-out number is macro-F1, so the two are not strictly comparable. Cross-validating with a matching metric would reduce this risk, as sketched below.
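A minimal sketch of that cross-validation, assuming the train_test matrix and train_df from the cells above:

from sklearn.model_selection import cross_val_score

# 5-fold CV on the training split, scored with macro-F1 so the result is
# comparable to the hold-out scores; n_jobs=-1 parallelizes the forest
# (the grid search above ran sequentially, hence the 167.9 min).
scores = cross_val_score(
    RandomForestClassifier(n_estimators=900, n_jobs=-1),
    train_test[:10000], train_df['label'].values[:10000],
    cv=5, scoring='f1_macro',
)
print(scores.mean(), scores.std())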

Switching to an SVM model:

from sklearn import svm

# RBF-kernel SVM on the same features and split
model = svm.SVC(C=1.0, gamma=0.1)
model.fit(train_test[:10000], train_df['label'].values[:10000])
pred = model.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], pred, average='macro'))
0.8046857446164133

Due to time constraints I have not yet searched over the SVM parameters or used cross-validation; I will add these later. A possible starting point is sketched below.
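A hedged sketch of that search; the C/gamma grid values are illustrative guesses, not tuned results:

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Small illustrative grid over the RBF-SVM hyperparameters, scored with
# macro-F1 via 5-fold cross-validation.
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
gd_svm = GridSearchCV(svm.SVC(), param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
gd_svm.fit(train_test[:10000], train_df['label'].values[:10000])
print(gd_svm.best_params_, gd_svm.best_score_)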

