We used two feature-extraction approaches:
1. Word-frequency counting
2. Keyword extraction (TF-IDF)
The keyword-extraction approach performs somewhat better.
Step 1: Read the data
import pandas as pd

# Read the data; name the columns ['category', 'theme', 'URL', 'content']
df_new = pd.read_table('./data/val.txt',
                       names=['category', 'theme', 'URL', 'content'],
                       encoding='utf-8')
df_new = df_new.dropna()  # drop rows with missing values
print(df_new.head())
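Before preprocessing, it can help to glance at the size and class balance of the data. The following is a minimal sanity check using standard pandas calls; it is not part of the original pipeline.

# Optional sanity check (not in the original write-up): inspect size and class balance
print(df_new.shape)
print(df_new['category'].value_counts())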
Step 2: Preprocess the data by segmenting each line of content into individual words
import jieba

# Convert the 'content' column of df_new to a list
content = df_new.content.values.tolist()

# Segment each line into individual words
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n':
        content_S.append(current_segment)
print(content_S[1000])
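For readers unfamiliar with jieba, here is a minimal sketch of what jieba.lcut returns, using a made-up sample sentence (not taken from the dataset):

import jieba

# Hypothetical sentence, only to show the shape of jieba.lcut's output
sample = "我来到北京清华大学"
print(jieba.lcut(sample))  # typically a list of word tokens, e.g. ['我', '来到', '北京', '清华大学']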
Step 3: Compare against the stop-word list and remove the stop words from the content
# Put the segmented content into a DataFrame
df_content = pd.DataFrame({'content_S': content_S})
print(df_content.head())

# Load the stop-word list
stopwords = pd.read_csv("stopwords.txt", index_col=False, sep="\t",
                        quoting=3, names=['stopword'], encoding='utf-8')

# Compare each word against the stop-word list and drop it if it is a stop word
def drop_stopwords(content, stopwords):
    content_clean = []
    all_words = []  # collected for word-frequency counting
    for line in content:
        line_clean = []
        for words in line:
            if words in stopwords:
                continue
            line_clean.append(words)
            all_words.append(str(words))
        content_clean.append(line_clean)
    return content_clean, all_words

# Convert the DataFrame columns to plain lists
content = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
content_clean, all_words = drop_stopwords(content, stopwords)

# Store the cleaned content in a new DataFrame
df_content = pd.DataFrame({'content_clean': content_clean})
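As a quick check of drop_stopwords, a toy input (both word lists are made up for illustration) shows how stop words are removed while the remaining words are collected for frequency counting:

# Hypothetical toy data, only to demonstrate drop_stopwords
toy_content = [['我们', '喜欢', '自然语言', '处理'], ['的', '模型', '效果', '不错']]
toy_stopwords = ['我们', '的']
toy_clean, toy_all = drop_stopwords(toy_content, toy_stopwords)
print(toy_clean)  # [['喜欢', '自然语言', '处理'], ['模型', '效果', '不错']]
print(toy_all)    # ['喜欢', '自然语言', '处理', '模型', '效果', '不错']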
Step 4: Build the model. The segmented content needs to be rejoined into space-separated strings with ' '.join, and the category labels need to be converted to numeric values.
# Naive Bayes text classification
# Build a DataFrame: X is the cleaned content, y is the category
df_train = pd.DataFrame({'content_clean': content_clean, 'label': df_new['category']})

# Check how many classes there are
print(df_train.label.unique())

# Map the string labels to numbers to make computation easier
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5,
                 "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0}
df_train['label'] = df_train['label'].map(label_mapping)

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train['content_clean'].values,
                                                    df_train['label'].values,
                                                    random_state=1)

# Join each sample into a space-separated string,
# e.g. ["dog cat fish", "dog cat cat", "fish bird", "bird"]
words = []
for line in x_train:
    try:
        words.append(' '.join(line))
    except:
        print(line)
print(words[0])

# Build the word-count features
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)

# Train the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)

# Build the space-separated strings for the test set
test_words = []
for line_index in range(len(x_test)):
    try:
        test_words.append(' '.join(x_test[line_index]))
    except:
        print(line_index)

# Accuracy on the test set
print(classifier.score(vec.transform(test_words), y_test))
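With the vectorizer and classifier fitted, predicting the category of a fresh document follows the same segment, join, and transform path. The sketch below is a hypothetical usage example: the sample sentence is made up, and the printed number maps back to a category name via label_mapping above.

# Predict the category of a new document (hypothetical example text)
new_text = "球队在主场赢得了一场关键比赛"
new_words = ' '.join(jieba.lcut(new_text))  # segment, then rejoin with spaces
pred = classifier.predict(vec.transform([new_words]))
print(pred)  # e.g. array([5]) would correspond to "体育" under label_mapping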
Step 5: Use keyword extraction (TF-IDF) and check the classification result; after comparison, it performs better.
# Use TF-IDF keyword features instead of raw counts
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vectorizer.fit(words)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
print(classifier.score(vectorizer.transform(test_words), y_test))
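The only change from Step 4 is swapping CountVectorizer for TfidfVectorizer, which down-weights words that appear across many documents. A tiny sketch on a made-up two-document corpus illustrates the weighting; the corpus and words here are purely for illustration.

from sklearn.feature_extraction.text import TfidfVectorizer

# Made-up two-document corpus: '汽车' appears in both documents,
# so it receives a lower TF-IDF weight than the document-specific words
toy_docs = ["汽车 发动机 保养", "汽车 股市 利率"]
toy_vec = TfidfVectorizer(analyzer='word', lowercase=False)
toy_matrix = toy_vec.fit_transform(toy_docs)
print(toy_vec.vocabulary_)   # word -> column index
print(toy_matrix.toarray())  # the shared word's column has smaller values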