Task 2 部分:
# Load the news-classification training set: tab-separated file with a numeric
# `label` column and a `text` column of space-separated anonymized token ids.
import pandas as pd

train_df = pd.read_csv('./train_set.csv', sep='\t')
# Preview the first rows (rendered automatically in a notebook cell).
train_df.head()
label | text | |
---|---|---|
0 | 2 | 2967 6758 339 2021 1854 3731 4109 3792 4149 15... |
1 | 11 | 4464 486 6352 5619 2465 4802 1452 3137 5778 54... |
2 | 3 | 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... |
3 | 2 | 7159 948 4866 2109 5520 2490 211 3956 5520 549... |
4 | 3 | 3646 3055 3055 2490 4659 6065 3370 5814 2465 5... |
def _token_count(doc):
    # Document length = number of space-separated token ids.
    return len(doc.split(' '))

train_df['text_len'] = train_df['text'].apply(_token_count)
train_df  # display the frame with the new column
label | text | text_len | |
---|---|---|---|
0 | 2 | 2967 6758 339 2021 1854 3731 4109 3792 4149 15... | 1057 |
1 | 11 | 4464 486 6352 5619 2465 4802 1452 3137 5778 54... | 486 |
2 | 3 | 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... | 764 |
3 | 2 | 7159 948 4866 2109 5520 2490 211 3956 5520 549... | 1570 |
4 | 3 | 3646 3055 3055 2490 4659 6065 3370 5814 2465 5... | 307 |
... | ... | ... | ... |
199995 | 2 | 307 4894 7539 4853 5330 648 6038 4409 3764 603... | 868 |
199996 | 2 | 3792 2983 355 1070 4464 5050 6298 3782 3130 68... | 1142 |
199997 | 11 | 6811 1580 7539 1252 1899 5139 1386 3870 4124 1... | 1180 |
199998 | 2 | 6405 3203 6644 983 794 1913 1678 5736 1397 191... | 179 |
199999 | 3 | 4350 3878 3268 1699 6909 5505 2376 2465 6088 2... | 2098 |
200000 rows × 3 columns
# Summary statistics of document lengths (count/mean/std/quartiles/max).
train_df['text_len'].describe()
count 200000.000000
mean 907.207110
std 996.029036
min 2.000000
25% 374.000000
50% 676.000000
75% 1131.000000
max 57921.000000
Name: text_len, dtype: float64
# NOTE: `%matplotlib inline` is an IPython magic (notebook-only, not plain Python).
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of document lengths; 200 bins over a heavy right tail (max 57921).
plt.hist(train_df['text_len'], bins=200)
plt.xlabel('Text char count')
plt.title("Histogram of char count")
Text(0.5, 1.0, 'Histogram of char count')
![Histogram of char count](output_5_1.png)
# Number of documents per class label, as a bar chart (pandas plots via matplotlib).
train_df['label'].value_counts().plot(kind='bar')
plt.title('News class count')
plt.xlabel("category")
Text(0.5, 0, 'category')
![News class count](output_6_1.png)
from collections import Counter

# Merge every document into one token stream and count token frequencies.
corpus = ' '.join(train_df['text'])
word_count = Counter(corpus.split(' '))
# most_common() returns (token, count) pairs sorted by count descending —
# the same ordering as sorted(items, key=count, reverse=True).
word_count = word_count.most_common()
print(len(word_count))
# 6869 distinct tokens
print(word_count[0])
# most frequent: ('3750', 7482224)
print(word_count[-1])
# rarest: ('3133', 1)
# Document frequency: de-duplicate tokens within each document, then count how
# many documents each token occurs in.
train_df['text_unique'] = train_df['text'].apply(lambda doc: ' '.join(set(doc.split(' '))))
all_lines = ' '.join(train_df['text_unique'])
word_count = Counter(all_lines.split(' ')).most_common()
print(word_count[0])
# ('3750', 197997) — present in ~99% of the 200000 docs
print(word_count[1])
# ('900', 197653)
print(word_count[2])
# ('648', 191975)
Task 2 相关问题解答:
# 1. Assume each punctuation-like token ('3750', '900', '648') ends a sentence;
#    compute the average number of sentences per document.
_SENTENCE_MARKS = {'3750', '900', '648'}
seq = [sum(1 for tok in line.split(' ') if tok in _SENTENCE_MARKS)
       for line in train_df['text']]
mean_seq = sum(seq) / len(seq)
mean_seq  # displayed in a notebook
78.34829
# 2. De-duplicate each document's tokens, then (below) find the most frequent
#    character per class. (Recomputes text_unique; identical to the earlier cell.)
train_df['text_unique'] = train_df['text'].apply(lambda doc: ' '.join(set(doc.split(' '))))
train_df['label'].unique()  # all distinct class labels, in order of first appearance
array([ 2, 11, 3, 9, 10, 12, 0, 7, 4, 1, 6, 5, 8, 13],
dtype=int64)
# Merge each class's de-duplicated documents into one token stream.
sequence = []
for label in train_df['label'].unique():
    sequence.append(' '.join(train_df[train_df['label'] == label]['text_unique']))

# For each class, scan its token stream with a running maximum and keep the
# most frequent token, excluding the three punctuation-like tokens.
r = []
for class_text in sequence:
    counts = {}
    best_word, best_count = '', 0
    for tok in class_text.split(' '):
        counts[tok] = counts.get(tok, 0) + 1
        # Strict '>' means the first token to reach a new maximum wins ties,
        # exactly mirroring the original running-max scan.
        if counts[tok] > best_count and tok not in ('3750', '900', '648'):
            best_word = tok
            best_count = counts[tok]
    r.append(best_word)
r  # most frequent character per class, in label order (displayed in a notebook)
['7399',
'6122',
'2465',
'7399',
'885',
'3370',
'2465',
'3370',
'4853',
'3370',
'5620',
'6122',
'6122',
'2662']
# Each class label paired with its most frequent non-punctuation character.
for label, token in zip(train_df['label'].unique(), r):
    print(label, token)
2 7399
11 6122
3 2465
9 7399
10 885
12 3370
0 2465
7 3370
4 4853
1 3370
6 5620
5 6122
8 6122
13 2662
Task 3 部分:
# Baseline: bag-of-words counts + ridge classifier, validated on a held-out tail.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

# 15k-row subsample: first 10k rows train, last 5k validate.
N_TRAIN = 10000
train_df = pd.read_csv('./train_set.csv', sep='\t', nrows=15000)

vectorizer = CountVectorizer(max_features=3000)
train_test = vectorizer.fit_transform(train_df['text'])

clf = RidgeClassifier()
clf.fit(train_test[:N_TRAIN], train_df['label'].values[:N_TRAIN])
val_pred = clf.predict(train_test[N_TRAIN:])
print(f1_score(train_df['label'].values[N_TRAIN:], val_pred, average='macro'))
0.741494277019762
# Same ridge classifier, but with TF-IDF features over 1- to 3-grams.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
train_test = tfidf.fit_transform(train_df['text'])

clf = RidgeClassifier()
X_train, y_train = train_test[:10000], train_df['label'].values[:10000]
clf.fit(X_train, y_train)
val_pred = clf.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], val_pred, average='macro'))
0.8721598830546126
Task 3 相关问题解答:
1.变化TfidfVectorizer的参数,找到最合适的值
# 1. Sweep TfidfVectorizer hyper-parameters (max n-gram length x vocabulary size).
#    Fix: the original printed 25 bare scores with no parameter labels, so the
#    results could not be mapped back to their settings; label every line and
#    track the best combination explicitly.
best_score, best_params = 0.0, None
for max_ngram in range(1, 6):
    for n_features in [1000, 2000, 3000, 4000, 5000]:
        tfidf = TfidfVectorizer(ngram_range=(1, max_ngram), max_features=n_features)
        train_test = tfidf.fit_transform(train_df['text'])
        clf = RidgeClassifier()
        clf.fit(train_test[:10000], train_df['label'].values[:10000])
        val_pred = clf.predict(train_test[10000:])
        score = f1_score(train_df['label'].values[10000:], val_pred, average='macro')
        print(f'ngram_range=(1,{max_ngram}) max_features={n_features} f1={score}')
        if score > best_score:
            best_score, best_params = score, (max_ngram, n_features)
print('best:', best_params, best_score)
0.835944644945302
0.8610501146547297
0.8588110190969614
0.8601916764212559
0.8603325900148268
0.8288900927279318
0.8584782097110735
0.8727002372924054
0.8794233135546486
0.8862954550407692
0.8270776630718544
0.8603842642428617
0.8721598830546126
0.8753274805998447
0.8850817067811825
由此可知ngram_range=(1,2),max_features=5000时得分最高
2.换用不同模型训练数据,观察得分情况
# 2. Random forest, tuning n_estimators with 5-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

n_estimators = range(100, 1100, 100)
hyper = {'n_estimators': n_estimators}
# Fix: the original ran with the default n_jobs=1 (the log shows the sequential
# backend taking ~168 min); n_jobs=-1 uses all cores with identical results.
gd = GridSearchCV(estimator=RandomForestClassifier(), param_grid=hyper,
                  verbose=True, n_jobs=-1)

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
train_test = tfidf.fit_transform(train_df['text'])
gd.fit(train_test[:10000], train_df['label'].values[:10000])
print(gd.best_score_)
print(gd.best_estimator_)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 167.9min finished
0.8684000000000001
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=900,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
# Refit the best forest size on the train split and score on the validation tail.
# Fix: n_jobs=-1 parallelizes tree building (900 trees) without changing the
# statistical behavior. NOTE(review): no random_state is set, so scores vary
# between runs — consider fixing a seed for reproducibility.
model = RandomForestClassifier(n_estimators=900, n_jobs=-1)
model.fit(train_test[:10000], train_df['label'].values[:10000])
pred = model.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], pred, average='macro'))
0.7979654404582549
此处存在过拟合现象,可通过交叉验证降低过拟合风险
换用svm模型训练数据
# SVM classifier (default RBF kernel) on the same TF-IDF features.
from sklearn import svm

X_train, y_train = train_test[:10000], train_df['label'].values[:10000]
model = svm.SVC(C=1.0, gamma=0.1)
model.fit(X_train, y_train)
pred = model.predict(train_test[10000:])
print(f1_score(train_df['label'].values[10000:], pred, average='macro'))
0.8046857446164133
时间关系没有遍历svm参数以及使用交叉验证,后续逐步加入