目的:
1.学习使用Pandas读取数据
2.分析赛题数据的分布规律
读取数据
import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt
from collections import Counter
# Read the training data from a CSV file with pandas.
#   sep:   the character that separates columns (a tab here)
#   nrows: the number of rows to read
# The path is written as a raw string so that every Windows backslash is
# taken literally; mixing "\P" / "\d" (invalid escape sequences) with "\\"
# produces warnings on current Python versions.
train_df = pd.read_csv(
    r'D:\PycharmProjects\nlp_start\data\train_set.csv',
    sep='\t',
    nrows=1000,
)
# The loaded data is a table: the first column is the news category and the
# second column is the news text (a sequence of character ids).
print(train_df.head())
结果:
句子长度分析
# Characters in a text are separated by single spaces, so the length of a
# text is the size of its space-split list.
text_lengths = train_df['text'].apply(lambda doc: len(doc.split(' ')))
train_df['text_len'] = text_lengths
print(train_df['text_len'].describe())

# Draw the text lengths as a histogram with 200 bins.
_ = plt.hist(train_df['text_len'], bins=200)
plt.title("Histogram of char count")
plt.xlabel('Text char count')
plt.show()
结果:
新闻类别分布
# Count how many news texts fall into each category and show the counts as
# a bar chart.
label_counts = train_df['label'].value_counts()
label_counts.plot(kind='bar')
plt.xlabel("category")
plt.title('News class count')
plt.show()
结果:
字符分布统计
# Character frequency statistics: join every text of the training set into
# one long string, split it back into characters, and count how many times
# each character occurs.
all_lines = ' '.join(train_df['text'])
char_counter = Counter(all_lines.split(" "))
# most_common() yields (character, count) pairs ordered from the most to the
# least frequent.
word_count = char_counter.most_common()
print(len(word_count))   # number of distinct characters
most_frequent, least_frequent = word_count[0], word_count[-1]
print(most_frequent)
print(least_frequent)
# Punctuation marks can be inferred from how many texts a character appears
# in. The code below counts, for every character, the number of texts that
# contain it. Per the original notes, characters 3750, 900 and 648 cover
# close to 99% of the 200k news texts, so they are very likely punctuation.
train_df['text_unique'] = train_df['text'].apply(
    lambda doc: ' '.join(set(doc.split(' ')))  # keep each character once per text
)
all_lines = ' '.join(train_df['text_unique'])
coverage_counter = Counter(all_lines.split(" "))
word_count = coverage_counter.most_common()
# Show the three characters that occur in the largest number of texts.
for rank in range(3):
    print(word_count[rank])
结果:
本章作业
1.假设字符3750,字符900和字符648是句子的标点符号,请分析赛题每篇新闻平均由多少个句子构成?
2.统计每类新闻中出现次数最多的字符
# Homework 1: assuming characters 3750, 900 and 648 are sentence punctuation,
# estimate how many sentences an average news text consists of.
all_lines = ' '.join(train_df['text'])
# Count whole space-separated tokens. A substring search such as
# all_lines.count("900") would also match inside longer tokens (for example
# "1900" or "6480") and overstate the number of punctuation marks.
token_counts = Counter(all_lines.split(' '))
count_3750 = token_counts["3750"]
count_900 = token_counts["900"]
count_648 = token_counts["648"]
# Average over the number of texts actually loaded rather than a hard-coded
# 1000, so the result stays correct if nrows changes or the file is shorter.
count_lines = (count_648 + count_900 + count_3750) / len(train_df)
print(count_lines)
# Homework 2: find the most frequent character within each news category.
# Collect the texts of every label first and join them once per label;
# appending to a growing string row by row is quadratic. Iterating the two
# columns in lockstep also avoids train_df['label'][i], which looks rows up
# by index label and breaks when the index is not 0..n-1.
texts_by_label = {}
for label, text in zip(train_df['label'], train_df['text']):
    texts_by_label.setdefault(label, []).append(text)
# dic maps each label to all of its texts joined by single spaces.
dic = {label: ' '.join(texts) for label, texts in texts_by_label.items()}

for key, joined_text in dic.items():
    # (character, count) pairs ordered from most to least frequent.
    word_count = Counter(joined_text.split(" ")).most_common()
    print(key, word_count[0])
结果: