import pandas as pd
# Load the training set. The file is tab-separated; the preview below shows
# a `label` column and a `text` column of space-separated token ids.
train_df = pd.read_csv('./train_set.csv', sep='\t')

# Preview the first five rows.
train_df.head(n=5)
| label | text |
---|
0 | 2 | 2967 6758 339 2021 1854 3731 4109 3792 4149 15... |
---|
1 | 11 | 4464 486 6352 5619 2465 4802 1452 3137 5778 54... |
---|
2 | 3 | 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... |
---|
3 | 2 | 7159 948 4866 2109 5520 2490 211 3956 5520 549... |
---|
4 | 3 | 3646 3055 3055 2490 4659 6065 3370 5814 2465 5... |
---|
def _token_count(doc):
    """Return the number of pieces obtained by splitting *doc* on single spaces."""
    return len(doc.split(' '))


# Per-document length, measured in space-separated tokens.
train_df['text_len'] = train_df['text'].apply(_token_count)
train_df
| label | text | text_len |
---|
0 | 2 | 2967 6758 339 2021 1854 3731 4109 3792 4149 15... | 1057 |
---|
1 | 11 | 4464 486 6352 5619 2465 4802 1452 3137 5778 54... | 486 |
---|
2 | 3 | 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... | 764 |
---|
3 | 2 | 7159 948 4866 2109 5520 2490 211 3956 5520 549... | 1570 |
---|
4 | 3 | 3646 3055 3055 2490 4659 6065 3370 5814 2465 5... | 307 |
---|
... | ... | ... | ... |
---|
199995 | 2 | 307 4894 7539 4853 5330 648 6038 4409 3764 603... | 868 |
---|
199996 | 2 | 3792 2983 355 1070 4464 5050 6298 3782 3130 68... | 1142 |
---|
199997 | 11 | 6811 1580 7539 1252 1899 5139 1386 3870 4124 1... | 1180 |
---|
199998 | 2 | 6405 3203 6644 983 794 1913 1678 5736 1397 191... | 179 |
---|
199999 | 3 | 4350 3878 3268 1699 6909 5505 2376 2465 6088 2... | 2098 |
---|
200000 rows × 3 columns
# Summary statistics (count, mean, std, min, quartiles, max) of the
# per-document token counts stored in `text_len`.
train_df['text_len'].describe()
count 200000.000000
mean 907.207110
std 996.029036
min 2.000000
25% 374.000000
50% 676.000000
75% 1131.000000
max 57921.000000
Name: text_len, dtype: float64
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of document lengths, drawn on the current axes with 200 bins.
ax = plt.gca()
ax.hist(train_df['text_len'], bins=200)
ax.set_xlabel('Text char count')
# Kept as the last expression so the notebook still echoes the title's Text.
ax.set_title("Histogram of char count")
Text(0.5, 1.0, 'Histogram of char count')
# Number of documents per class label, most frequent first, as a bar chart.
class_counts = train_df['label'].value_counts()
class_counts.plot(kind='bar')
plt.title('News class count')
plt.xlabel("category")
Text(0.5, 0, 'category')
from collections import Counter
# Count how often every token occurs across the whole corpus.
#
# The counter is fed one document at a time instead of first joining all
# documents into a single string and splitting it again: that intermediate
# string and token list hold the entire corpus in memory at once, which is
# unnecessary for counting. Tokens are still seen in the same order, so the
# resulting counts and their ordering are unchanged.
token_counter = Counter()
for doc in train_df['text']:
    token_counter.update(doc.split(' '))

# List of (token, count) pairs, most frequent first; ties keep first-seen order.
word_count = token_counter.most_common()

print(len(word_count))   # number of distinct tokens
print(word_count[0])     # most frequent token and its count
print(word_count[-1])    # least frequent token and its count
# Reduce every document to its set of distinct tokens (order within a
# document is arbitrary because it comes from a set).
train_df['text_unique'] = train_df['text'].apply(
    lambda x: ' '.join(set(x.split(' ')))
)

# Because each token now appears at most once per document, counting over
# `text_unique` gives the number of documents that contain each token.
# Documents are streamed into the counter one by one rather than being joined
# into one corpus-sized string first, which keeps peak memory low.
doc_frequency = Counter()
for unique_tokens in train_df['text_unique']:
    doc_frequency.update(unique_tokens.split(' '))

# List of (token, document count) pairs, highest document count first.
word_count = doc_frequency.most_common()

# The three tokens that occur in the most documents.
print(word_count[0])
print(word_count[1])
print(word_count[2])
本章作业
- 假设字符3750,字符900和字符648是句子的标点符号,请分析赛题每篇新闻平均由多少个句子构成?
- 统计每类新闻中出现次数最多的字符
# Tokens that the exercise treats as sentence-ending punctuation.
sentence_marks = ('3750', '900', '648')

# For every document, count how many of its tokens are sentence marks.
# That count is used as the document's number of sentences (text after the
# last mark is therefore not counted as an extra sentence).
seq = [
    sum(1 for token in doc.split(' ') if token in sentence_marks)
    for doc in train_df['text']
]

# Average number of sentence marks per document.
mean_seq = sum(seq) / len(seq)
mean_seq
78.34829
def _distinct_tokens(doc):
    """Return the distinct space-separated tokens of *doc*, joined by spaces.

    The tokens pass through a set, so their order in the result is arbitrary.
    """
    return ' '.join(set(doc.split(' ')))


# One string of distinct tokens per document.
train_df['text_unique'] = train_df['text'].apply(_distinct_tokens)

# Class labels in order of first appearance in the data.
train_df['label'].unique()
array([ 2, 11, 3, 9, 10, 12, 0, 7, 4, 1, 6, 5, 8, 13],
dtype=int64)
# For each class label (in order of first appearance), concatenate the
# `text_unique` strings of all documents carrying that label into one
# space-separated string.
sequence = [
    ' '.join(train_df[train_df['label'] == label]['text_unique'])
    for label in train_df['label'].unique()
]
# Tokens treated as punctuation; they are counted but can never be selected.
excluded_tokens = ('3750', '900', '648')

# r[k] is the most frequent non-punctuation token of the k-th entry of
# `sequence` ('' if that entry has no eligible token).
# NOTE(review): if `sequence` is built from the de-duplicated `text_unique`
# column, these counts are numbers of documents containing the token rather
# than raw occurrences — confirm that this is what the exercise asks for.
r = []
for class_text in sequence:
    counts = {}
    best_token = ''
    best_count = 0
    for token in class_text.split(' '):
        seen = counts.get(token, 0) + 1
        counts[token] = seen
        # Running maximum: on ties, the token that reached the count first
        # stays selected because the comparison is strict.
        if seen > best_count and token not in excluded_tokens:
            best_token = token
            best_count = seen
    r.append(best_token)
r
['7399',
'6122',
'2465',
'7399',
'885',
'3370',
'2465',
'3370',
'4853',
'3370',
'5620',
'6122',
'6122',
'2662']
# Print each class label next to the top token found for it.
# `r` follows the order of train_df['label'].unique(), so the two are paired
# positionally. The unique labels are computed once here instead of being
# recomputed (a full scan of the label column) on every loop iteration.
for label, top_token in zip(train_df['label'].unique(), r):
    print(label, top_token)
2 7399
11 6122
3 2465
9 7399
10 885
12 3370
0 2465
7 3370
4 4853
1 3370
6 5620
5 6122
8 6122
13 2662