import pandas as pd
读取数据
# The file is large, so load only the first 100 rows for a quick first look.
train_df = pd.read_csv('./train_set.csv', sep='\t', nrows=100)
# Preview the first five rows (label + anonymized text columns).
train_df.head()
label | text | |
---|---|---|
0 | 2 | 2967 6758 339 2021 1854 3731 4109 3792 4149 15... |
1 | 11 | 4464 486 6352 5619 2465 4802 1452 3137 5778 54... |
2 | 3 | 7346 4068 5074 3747 5681 6093 1777 2226 7354 6... |
3 | 2 | 7159 948 4866 2109 5520 2490 211 3956 5520 549... |
4 | 3 | 3646 3055 3055 2490 4659 6065 3370 5814 2465 5... |
由标签列和text组成
# Same preview, printed as plain text instead of a rendered table.
print(train_df.head())
label text
0 2 2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1 11 4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2 3 7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3 2 7159 948 4866 2109 5520 2490 211 3956 5520 549...
4 3 3646 3055 3055 2490 4659 6065 3370 5814 2465 5...
数据分析
在读取完成数据集后,我们还可以对数据集进行数据分析的操作。虽然对于非结构数据并不需要做很多的数据分析,但通过数据分析还是可以找出一些规律的。
此步骤我们读取了所有的训练集数据,在此我们通过数据分析希望得出以下结论:
赛题数据中,新闻文本的长度是多少?
赛题数据的类别分布是怎么样的,哪些类别比较多?
赛题数据中,字符分布是怎么样的?
句子长度分析
在赛题数据中每行句子的字符使用空格进行隔开,所以可以直接统计单词的个数来得到每个句子的长度。统计结果如下
# Reload the complete training set (all rows this time).
train_df = pd.read_csv('./train_set.csv', sep='\t')
# Each document is a space-separated sequence of anonymized token ids;
# show the first document split into its tokens.
first_doc = train_df['text'].iloc[0]
print(first_doc.split(" "))
['2967', '6758', '339', '2021', '1854', '3731', '4109', '3792', '4149', '1519', '2058', '3912', '2465', '2410', '1219', '6654', '7539', '264', '2456', '4811', '1292', '2109', '6905', '5520', '7058', '6045', '3634', '6591', '3530', '6508', '2465', '7044', '1519', '3659', '2073', '3750', '3731', '4109', '3792', '6831', '2614', '3370', '4269', '3370', '486', '5770', '4109', '4125', '3750', '5445', '2466', '6831', '6758', '3743', '3630', '1726', '2313', '5906', '826', '4516', '657', '900', '1871', '7044', '3750', '2967', '3731', '1757', '1939', '648', '2828', '4704', '7039', '3706', '3750', '965', '2490', '7399', '3743', '2145', '2407', '7451', '3775', '6017', '5998', '1641', '299', '4704', '2621', '7029', '3056', '6333', '433', '648', '1667', '1099', '900', '2289', '1099', '648', '5780', '220', '7044', '1279', '7426', '4269', '3750', '2967', '6758', '6631', '3099', '2205', '7305', '2620', '5977', '3750', '3329', '1793', '6666', '2042', '3193', '4149', '1519', '7039', '3706', '2446', '5399', '648', '4124', '2058', '3912', '248', '3193', '2252', '5649', '2212', '4939', '7239', '3310', '4525', '2400', '900', '5770', '4109', '4125', '7044', '4921', '265', '1397', '4699', '1699', '669', '6407', '3750', '1271', '1271', '4741', '669', '4659', '3870', '4030', '4167', '5338', '25', '3466', '6909', '4417', '1859', '3750', '1465', '7194', '648', '3938', '1571', '848', '6986', '827', '2124', '3750', '1991', '7444', '7037', '2729', '908', '6308', '3750', '1889', '6810', '4190', '591', '5598', '2289', '2109', '6831', '6407', '2400', '5410', '517', '900', '25', '3731', '4109', '3792', '4128', '1679', '4811', '4853', '4109', '3630', '6902', '6122', '1903', '1736', '3915', '2602', '6822', '3750', '6630', '4265', '591', '729', '4448', '648', '1465', '1401', '4853', '648', '5881', '6182', '4128', '1679', '4939', '2646', '652', '340', '7328', '1320', '900', '1460', '619', '5505', '2376', '4853', '3272', '3750', '4853', '4109', '3630', '6902', '3362', '2810', '3750', '803', '1985', 
'4128', '669', '19', '6508', '900', '1635', '1871', '7377', '6122', '6017', '3750', '2289', '1099', '3938', '1571', '7509', '1375', '5393', '5589', '5037', '2115', '4707', '5310', '6811', '6093', '900', '7399', '2410', '1219', '6654', '3263', '6017', '3750', '5998', '4939', '5971', '4148', '3750', '803', '1985', '7194', '4780', '796', '6038', '4231', '648', '1722', '6407', '3750', '1099', '6485', '1920', '1767', '5915', '6518', '6093', '5598', '5648', '4280', '900', '7326', '6242', '5328', '1214', '3870', '1985', '7194', '5998', '5741', '2115', '913', '5950', '3800', '1538', '686', '6734', '6017', '3750', '1985', '3659', '1324', '5814', '4998', '5176', '535', '7399', '307', '4068', '486', '1667', '1099', '2121', '6407', '3750', '7420', '3099', '6038', '4231', '4190', '1519', '3255', '7123', '4305', '3231', '1635', '4822', '1722', '3750', '2967', '3731', '1757', '1939', '648', '473', '6518', '2400', '2614', '5330', '5530', '1394', '4939', '1903', '7495', '7239', '900', '4469', '5530', '4704', '299', '7467', '2121', '669', '5693', '3750', '3618', '299', '5264', '4853', '1734', '316', '2828', '5445', '4190', '4939', '3484', '6043', '2376', '1031', '761', '900', '5370', '3782', '2210', '669', '2210', '3099', '1363', '6301', '3508', '1907', '2410', '7509', '5718', '541', '3750', '803', '2967', '6758', '3038', '6641', '1985', '7194', '512', '4811', '6811', '5243', '2112', '3750', '1734', '2376', '2891', '1211', '648', '7257', '4148', '7159', '1667', '3750', '5816', '4202', '2400', '5864', '3915', '7399', '3414', '1667', '5977', '7327', '7256', '2935', '4936', '1667', '2151', '900', '6831', '4599', '6182', '3227', '3859', '3099', '7509', '7256', '3750', '1985', '7194', '4128', '4691', '2029', '1344', '6630', '5598', '1465', '648', '3706', '7403', '543', '3038', '900', '1985', '7194', '3800', '980', '6017', '980', '4124', '648', '900', '1635', '3605', '5028', '3731', '4109', '3792', '1866', '3578', '3915', '648', '4939', '1335', '6666', '6560', '3750', '3618', '3508', 
'1907', '2410', '1913', '6656', '3750', '2828', '4704', '4998', '4939', '7039', '3915', '4167', '5338', '3750', '803', '1985', '4939', '3263', '7123', '264', '2456', '5689', '2109', '648', '3750', '6093', '1699', '5589', '4411', '1866', '4750', '648', '1667', '1099', '3000', '7420', '1279', '2975', '1141', '7148', '3750', '1985', '3915', '2570', '4936', '5998', '1877', '3000', '7420', '900', '1635', '5470', '2313', '5864', '641', '4333', '3750', '3915', '5659', '316', '2828', '2770', '5176', '803', '2047', '7532', '606', '6980', '1635', '3750', '803', '1750', '7039', '3800', '7245', '3099', '7509', '5839', '3750', '1866', '1401', '4321', '5788', '1519', '6122', '6405', '4939', '5998', '2729', '900', '1985', '7194', '5998', '2289', '2107', '1519', '1592', '316', '2828', '1679', '4811', '5461', '3324', '4525', '4052', '3750', '2212', '742', '3750', '1985', '7194', '6631', '1335', '5445', '3750', '1465', '7194', '4128', '6887', '4819', '5977', '3223', '2717', '900', '5612', '5948', '3750', '1985', '7194', '2289', '913', '3800', '4811', '6122', '2614', '2047', '7532', '606', '6980', '900', '1985', '2541', '4409', '3772', '6012', '1833', '5560', '4173', '6662', '414', '340', '316', '4125', '4128', '3800', '669', '6575', '4819', '5977', '900', '1635', '25', '1460', '619', '7044', '4921', '648', '4407', '3800', '1241', '600', '3750', '5470', '2313', '641', '4333', '7539', '803', '316', '4125', '648', '3530', '6637', '569', '1985', '3000', '4659', '5610', '6917', '3750', '3618', '1985', '6887', '7010', '3870', '900', '3915', '4939', '7010', '3870', '5598', '1985', '1394', '3397', '5598', '900', '1635', '1460', '619', '5708', '1335', '6518', '4148', '3750', '2410', '1219', '6654', '2252', '1702', '5598', '803', '4646', '2109', '6905', '5520', '1635', '2663', '885', '5491', '1465', '4822', '1722', '5011', '2376', '4149', '1903', '2662', '3750', '803', '316', '2828', '1767', '5915', '6065', '2042', '1335', '5598', '3750', '2688', '5598', '3231', '5780', '7399', '3750', 
'4811', '5788', '1292', '1641', '1667', '1099', '4811', '5393', '6407', '5708', '6631', '1335', '6666', '900', '316', '4125', '4811', '648', '4939', '6678', '3750', '2021', '1726', '340', '4469', '4842', '4128', '669', '5393', '4801', '3154', '3750', '5780', '7399', '669', '3915', '544', '62', '5602', '1913', '5598', '3750', '3859', '6759', '4939', '4646', '1913', '900', '1635', '1767', '5915', '6065', '4464', '5814', '648', '2410', '1219', '6654', '1815', '1699', '6038', '4231', '5698', '1375', '62', '307', '3750', '803', '299', '5264', '1460', '316', '2828', '5445', '3750', '1985', '3414', '1667', '7509', '3223', '3750', '5998', '4939', '669', '2364', '2975', '648', '900', '1985', '3038', '5938', '5168', '3770', '1667', '3750', '2717', '368', '5693', '7117', '3750', '1985', '2131', '6909', '2192', '1141', '6831', '6015', '900', '3864', '7194', '1375', '5393', '1815', '1699', '1985', '5780', '7399', '5681', '3099', '5176', '3870', '5598', '3750', '1985', '3038', '3771', '6630', '7159', '1667', '900', '1635', '5659', '7377', '3166', '5445', '3750', '1793', '6666', '648', '2614', '5736', '5537', '5526', '4128', '6887', '4811', '495', '6386', '900', '1465', '7194', '1767', '5659', '2410', '1219', '6654', '340', '1362', '1829', '2304', '3193', '6822', '3750', '5330', '5264', '4321', '3750', '4173', '5619', '4109', '6227', '648', '5915', '6515', '4893', '5957', '6043', '3750', '5949', '4411', '5410', '1991', '4128', '826', '2490', '3193', '2602', '3750', '803', '1985', '7194', '4516', '5264', '1394', '3800', '5659', '3731', '4109', '3792', '5081', '2918', '3750', '5051', '1985', '5612', '19', '3750', '3731', '4109', '3792', '5718', '7239', '3193', '6822', '900', '1635', '7377', '5736', '3750', '2205', '7305', '2620', '2042', '5192', '1745', '3605', '6887', '5278', '299', '648', '5651', '7440', '1656', '3630', '1702', '3300', '7539', '803', '1985', '340', '3731', '4109', '3792', '4190', '4811', '4464', '1519', '5778', '3166', '3750', '1985', '3038', '6235', '7399', 
'5998', '2313', '900', '1635', '25', '910', '619', '4939', '1613', '248', '3193', '4741', '4893', '3750', '2967', '3731', '1757', '1939', '648', '7495', '5028', '5949', '4939', '7539', '803', '4811', '2255', '3915', '3750', '1394', '4741', '900', '6887', '2255', '3915', '3750', '1394', '669', '4741', '900', '1635']
%pylab inline
#对一整列进行操作 lambda x:y 输入参数x,返回y
train_df['text_len'] = train_df['text'].apply(lambda x: len(x.split(' ')))
#.describe()返回pd数据的统计信息
print(train_df['text_len'].describe())
%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib
count 200000.000000
mean 907.207110
std 996.029036
min 2.000000
25% 374.000000
50% 676.000000
75% 1131.000000
max 57921.000000
Name: text_len, dtype: float64
# Histogram of document lengths (200 bins); `_ =` suppresses the return value.
_ = plt.hist(train_df['text_len'], bins=200)
plt.xlabel('Text char count')
plt.title("Histogram of char count")
Text(0.5, 1.0, 'Histogram of char count')
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pRZGcCqB-1678715819407)(output_12_1.png)]
新闻类别分布
# Bar chart of how many articles belong to each label.
label_counts = train_df['label'].value_counts()
label_counts.plot(kind='bar')
plt.title('News class count')
plt.xlabel("category")
Text(0.5, 0, 'category')
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FyFAUv5t-1678715819407)(output_14_1.png)]
在数据集中标签的对应的关系如下:{‘科技’: 0, ‘股票’: 1, ‘体育’: 2, ‘娱乐’: 3, ‘时政’: 4, ‘社会’: 5, ‘教育’: 6, ‘财经’: 7, ‘家居’: 8, ‘游戏’: 9, ‘房产’: 10, ‘时尚’: 11, ‘彩票’: 12, ‘星座’: 13}
从统计结果可以看出,赛题的数据集类别分布存在较为不均匀的情况。在训练集中科技类新闻最多,其次是股票类新闻,最少的新闻是星座新闻。
字符分布统计
接下来可以统计每个字符出现的次数,首先可以将训练集中所有的句子进行拼接进而划分为字符,并统计每个字符的个数。
从统计结果中可以看出,在训练集中总共包括6869个字,其中编号3750的字出现的次数最多,编号3133的字出现的次数最少。
from collections import Counter

# Concatenate every document into one big space-separated string, then
# count how often each token id appears in the whole corpus.
all_lines = ' '.join(list(train_df['text']))
word_count = Counter(all_lines.split(" "))
# Counter.most_common() is the idiomatic equivalent of
# sorted(items, key=count, reverse=True): (token, count) pairs, descending.
word_count = word_count.most_common()
print('字符类统计:', len(word_count))
print('频次最多的字符:', word_count[0])
print('频次最少的字符:', word_count[-1])
字符类统计: 6869
频次最多的字符: ('3750', 7482224)
频次最少的字符: ('3133', 1)
这里还可以根据字在每个句子的出现情况,反推出标点符号。下面代码统计了不同字符在句子中出现的次数,其中字符3750,字符900和字符648在20w新闻的覆盖率接近99%,很有可能是标点符号。
# Deduplicate tokens within each document, so the counts below measure
# document frequency: in how many documents does each token appear?
train_df['text_unique'] = train_df['text'].apply(lambda x: ' '.join(set(x.split(' '))))
# Join all 200k deduplicated documents into one string.
all_lines = ' '.join(list(train_df['text_unique']))
# token -> number of documents containing it
word_count = Counter(all_lines.split(" "))
# Counts are already ints — no int() conversion needed for sorting.
word_count = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
print(word_count[0])
print(word_count[1])
print(word_count[2])
('3750', 197997)
('900', 197653)
('648', 191975)
# Coverage = fraction of documents containing each of the top-3 tokens.
# Use the actual row count instead of a hard-coded 200000.
a = len(train_df)
print(word_count[0][1]/a)
print(word_count[1][1]/a)
print(word_count[2][1]/a)
0.989985
0.988265
0.959875
由上可知,在20万条新闻中(每篇新闻内字符已去重),上面三个字符的覆盖率均达到96%左右,很有可能是标点符号。
数据分析的结论
通过上述分析我们可以得出以下结论:
赛题中每个新闻包含的字符个数平均为1000个,还有一些新闻字符较长;
赛题中新闻类别分布不均匀,科技类新闻样本量接近4w,星座类新闻样本量不到1k;
赛题总共包括7000-8000个字符;
通过数据分析,我们还可以得出以下结论:
每个新闻平均字符个数较多,可能需要截断;
由于类别不均衡,会严重影响模型的精度;
作业
问题1
假设字符3750,字符900和字符648是句子的标点符号,请分析赛题每篇新闻平均由多少个句子构成?
# Per document, count the three presumed punctuation tokens ('3750',
# '900', '648'); their combined frequency approximates the number of
# sentences in each article.
train_df['num1'] = train_df['text'].apply(lambda doc: Counter(doc.split(" ")))
# Sum the three punctuation counts from each document's token counter.
train_df['num2'] = train_df['num1'].apply(lambda c: c['3750'] + c['900'] + c['648'])
# Summary statistics of the estimated sentence counts.
train_df['num2'].describe()
count 200000.000000
mean 78.348290
std 85.519746
min 0.000000
25% 27.000000
50% 55.000000
75% 101.000000
max 3417.000000
Name: num2, dtype: float64
平均每篇新闻中,三种字符总共出现约78次,即平均每篇新闻约由78个句子组成
问题2
统计每类新闻中出现次数最多的字符?
# For each class: concatenate its documents, count token frequencies, and
# print the most frequent token. Iterate the labels actually present
# (ascending) instead of a hard-coded range(14).
for i in sorted(train_df['label'].unique()):
    # rows belonging to class i
    t = train_df[train_df.label == i]
    all_lines0 = ' '.join(list(t['text']))
    word_dic = Counter(all_lines0.split(' '))
    # most_common(1) returns [(token, count)] for the top entry —
    # no need to sort the whole frequency table (counts are already ints).
    print(f'第{i}类:', word_dic.most_common(1)[0])
第0类: ('3750', 1267331)
第1类: ('3750', 1200686)
第2类: ('3750', 1458331)
第3类: ('3750', 774668)
第4类: ('3750', 360839)
第5类: ('3750', 715740)
第6类: ('3750', 469540)
第7类: ('3750', 428638)
第8类: ('3750', 242367)
第9类: ('3750', 178783)
第10类: ('3750', 180259)
第11类: ('3750', 83834)
第12类: ('3750', 87412)
第13类: ('3750', 33796)