1. Data
import pandas as pd
import jieba
# Data (a small subset of news articles)
df_news = pd.read_table('val.txt',names=['category','theme','URL','content'],encoding='utf-8')
df_news = df_news.dropna()  # drop every row that contains NaN
print(df_news.head())
2. Segmentation with the jieba library
import pandas as pd
import jieba  # word segmentation
# Data (a small subset of news articles)
df_news = pd.read_table('val.txt',names=['category','theme','URL','content'],encoding='utf-8')
df_news = df_news.dropna()  # drop every row that contains NaN
#print(df_news.head())
# Segmentation
content = df_news.content.values.tolist()  # jieba expects a list of strings
print(content[1000])
print("------------------------------------------------------")
content_S = []  # holds the segmented results
for line in content:
    current_segment = jieba.lcut(line)  # segment one article with jieba
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip entries that are only a line break
        content_S.append(current_segment)
print(content_S[1000])
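For reference, jieba.lcut simply returns the list of tokens for a single string. A minimal standalone illustration (a hypothetical sentence; the exact split may vary slightly with the jieba version and dictionary):
import jieba
print(jieba.lcut("我爱自然语言处理"))  # typically ['我', '爱', '自然语言', '处理']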
3. Convert the segmentation results into a DataFrame
import pandas as pd
import jieba  # word segmentation
# Data (a small subset of news articles)
df_news = pd.read_table('val.txt',names=['category','theme','URL','content'],encoding='utf-8')
df_news = df_news.dropna()  # drop every row that contains NaN
#print(df_news.head())
# Segmentation
content = df_news.content.values.tolist()  # jieba expects a list of strings
#print(content[1000])
#print("------------------------------------------------------")
content_S = []  # holds the segmented results
for line in content:
    current_segment = jieba.lcut(line)  # segment one article with jieba
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip entries that are only a line break
        content_S.append(current_segment)
#print(content_S[1000])
# Convert the segmentation results into a DataFrame
df_content = pd.DataFrame({"content_S":content_S})
print(df_content.head())
4. Clean the data (the output above is clearly messy) by removing stopwords with a stopword list
Note: a stopword is a word that occurs very frequently in the corpus but carries little useful information, e.g. "的".
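As a toy illustration of the idea before the full pipeline below (hypothetical tokens and stopword list, not the project data):
# Toy example: drop stopwords from one tokenized sentence
tokens = ['今天', '的', '天气', '很', '好']
toy_stopwords = {'的', '很'}
print([w for w in tokens if w not in toy_stopwords])  # ['今天', '天气', '好']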
import pandas as pd
import jieba  # word segmentation
# Function that removes stopwords
def drop_stopwords(contents,stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))  # collect every remaining word into one flat list
        contents_clean.append(line_clean)
    return contents_clean,all_words
# Data (a small subset of news articles)
df_news = pd.read_table('val.txt',names=['category','theme','URL','content'],encoding='utf-8')
df_news = df_news.dropna()  # drop every row that contains NaN
#print(df_news.head())
# Segmentation
content = df_news.content.values.tolist()  # jieba expects a list of strings
#print(content[1000])
#print("------------------------------------------------------")
content_S = []  # holds the segmented results
for line in content:
    current_segment = jieba.lcut(line)  # segment one article with jieba
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip entries that are only a line break
        content_S.append(current_segment)
#print(content_S[1000])
# Convert the segmentation results into a DataFrame
df_content = pd.DataFrame({"content_S":content_S})
#print(df_content.head())
# Clean the messy data: remove stopwords with the stopword list
stopwords = pd.read_csv('stopwords.txt',index_col=False,sep='\t',quoting=3,names=["stopword"],encoding="utf-8")  # load the stopword list
print(stopwords.head(20))
print("-------------------------------------------------")
# Call the stopword-removal function
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean,all_words = drop_stopwords(contents,stopwords)
# Convert the cleaned results into a DataFrame
df_content = pd.DataFrame({"contents_clean":contents_clean})
print(df_content.head())
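The all_words list built here is what the next step uses for word-frequency statistics. As a minimal sketch of the idea (assuming all_words is the flat list of remaining words returned by drop_stopwords; collections.Counter is just one simple way to count, not necessarily the approach used later):
from collections import Counter
word_count = Counter(all_words)    # frequency of every word that survived cleaning
print(word_count.most_common(10))  # the ten most frequent words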
5. Count word frequencies with all_words
import numpy as np
import pandas as pd
import jieba  # word segmentation
# Function that removes stopwords
def drop_stopwords(contents,stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
for word in li