Python 新闻推荐系统(基于新闻内容推荐)

# -*- coding:utf-8 -*-
"""
@brief  : Content-based news recommendation system.
@details: Article text is segmented with jieba, weighted with TF-IDF, and
          articles are compared by pairwise cosine distance.
@author : zhphuang
@date   : 2019-08-07
"""

# Plain ASCII quotes: the typographic quotes in the original are a syntax error.
__version__ = "1.0.0.0"

import re

# The star import stays first so that the explicit imports below always win
# if pandas happens to export a clashing name.
from pandas import *

import jieba
import numpy as np
from bs4 import BeautifulSoup
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import pairwise_distances

from mysql_util import MySqlUtils

class ContentRecommend(object):
    """Content-based news recommender using TF-IDF and cosine distance.

    Article HTML is read from MySQL, reduced to plain text, segmented with
    jieba and vectorised with ``TfidfVectorizer``. The pairwise cosine
    distances between the vectorised articles are then used to list the
    nearest neighbours of an article.

    Attributes:
        con: ``MySqlUtils`` handle used to run the article query.
        recommend_count: number of similar articles listed per article.
        all_count: maximum number of rows (taken from the top of the query
            result) that are vectorised.
        df: DataFrame with the columns ``news_id``, ``content`` and ``url``.
        distance_matrix: square array of pairwise cosine distances; row
            ``i`` belongs to ``df.iloc[i]``.
    """

    # Connection settings used when the caller does not pass its own.
    DEFAULT_DATABASE_CONFIG = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "passwd": "root",
        "db": "recommend",
        "charset": "utf8",
    }

    # A token is kept only if it contains at least one character in the
    # range U+4E00..U+9FA5 (CJK ideographs).
    _ZH_PATTERN = re.compile("[\u4e00-\u9fa5]+")

    def __init__(self, database_config=None, recommend_count=10, all_count=1000):
        """Connect to the database and build the distance matrix.

        Args:
            database_config: mapping passed to ``MySqlUtils``; defaults to a
                copy of ``DEFAULT_DATABASE_CONFIG``.
            recommend_count: number of similar articles to list per article.
            all_count: maximum number of articles to vectorise.
        """
        if database_config is None:
            database_config = dict(self.DEFAULT_DATABASE_CONFIG)
        self.con = MySqlUtils(database_config)
        self.recommend_count = recommend_count
        self.all_count = all_count
        self.reload_data()

    def reload_data(self):
        """Reload all articles and recompute ``self.distance_matrix``.

        Only the first ``self.all_count`` rows are vectorised, so the matrix
        may have fewer rows than ``self.df``.
        """
        sql = (
            "select a.news_id,a.content,b.source_url as url "
            "from news_content as a "
            "left join news as b on a.news_id=b.news_id"
        )
        self.df = DataFrame(
            list(self.con.query(sql)),
            columns=["news_id", "content", "url"],
        )

        corpus = []
        for index, row in self.df.iloc[:self.all_count].iterrows():
            print(index)  # progress indicator
            corpus.append(self._tokenize(row["content"]))

        if not any(corpus):
            # TfidfVectorizer raises ValueError on an empty vocabulary;
            # with nothing to compare, expose an empty matrix instead.
            self.distance_matrix = np.zeros((0, 0))
            return

        vectorizer = TfidfVectorizer()  # computes a TF-IDF weight per term
        tfidf = vectorizer.fit_transform(corpus)
        # Vocabulary size. Read from vocabulary_ because get_feature_names()
        # no longer exists in current scikit-learn releases.
        print(len(vectorizer.vocabulary_))

        # The sparse matrix is passed as-is; converting it to a dense matrix
        # just for a debug print needs rows * vocabulary floats of memory.
        self.distance_matrix = pairwise_distances(tfidf, metric="cosine")

    def _tokenize(self, content):
        """Return the space-joined jieba tokens of *content* containing Chinese."""
        if not isinstance(content, (str, bytes)):
            content = ""  # NULL content (e.g. None/NaN) has no tokens
        text = BeautifulSoup(content, "html.parser").text
        return " ".join(
            seg for seg in jieba.cut(text) if self._ZH_PATTERN.search(seg)
        )

    def _nearest_indices(self, index):
        """Return the row indices of the closest articles to row *index*.

        The article itself is removed by index rather than by assuming it
        sorts first, because another article at distance 0 can sort ahead
        of it. At most ``self.recommend_count`` indices are returned,
        nearest first.
        """
        row = np.asarray(self.distance_matrix[index]).ravel()
        order = np.argsort(row, kind="stable")
        return order[order != index][:self.recommend_count]

    def _print_neighbours(self, index, with_url):
        """Print the header and the nearest articles for row *index*."""
        title = "与%s相似的文章有:" % self.df.iloc[index, 0]
        if with_url:
            title += "(链接地址:%s)" % self.df.iloc[index, 2]
        print("=" * 10 + title + "=" * 10)

        row = self.distance_matrix[index]
        for neighbour in self._nearest_indices(index):
            # The printed value is a cosine *distance* (smaller = closer).
            fields = [self.df.iloc[neighbour, 0], "余弦距离:%s" % row[neighbour]]
            if with_url:
                fields.append("链接地址:%s" % self.df.iloc[neighbour, 2])
            print(*fields)

    def analysis(self):
        """Print the nearest articles for every vectorised article."""
        for index in range(len(self.distance_matrix)):
            self._print_neighbours(index, with_url=False)

    def get_similar_news_id(self, news_id):
        """Print the articles most similar to *news_id*, with their URLs.

        Ids are compared as strings so that an id typed at the prompt (always
        a ``str``) matches a numeric id column. Prints a notice when no
        vectorised article has that id.

        Args:
            news_id: id of the article to look up (``str`` or number).
        """
        wanted = str(news_id).strip()
        vectorised_ids = self.df.iloc[:len(self.distance_matrix), 0]

        found = False
        for index, candidate in enumerate(vectorised_ids):
            if str(candidate) == wanted:
                found = True
                self._print_neighbours(index, with_url=True)
        if not found:
            print("未找到newsId:%s" % news_id)

    def get_recommend_news_by_uid(self, uid):
        """Placeholder for per-user recommendations; not implemented."""
        pass

    def input_news_id(self):
        """Prompt for news ids in a loop and print similar articles.

        The loop ends when stdin is closed or the user presses Ctrl-C.
        """
        while True:
            print("输入newsId:")
            try:
                news_id = input()
            except (EOFError, KeyboardInterrupt):
                break
            self.get_similar_news_id(news_id)

# Script entry point: build the recommender and start the interactive prompt.
# Plain ASCII quotes: the typographic quotes in the original are a syntax error.
if __name__ == "__main__":
    ContentRecommend().input_news_id()

原文地址:https://www.cnblogs.com/niuniuc/p/11314661.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值