系列目录
功能介绍
新闻采集器采集到的新闻原数据包括新闻标题、新闻日期、新闻正文、新闻图片、新闻视频等。基于这些采集到的新闻数据,进行以下分析:
- 关键词分析
- 新闻相似度计算
- 热词分析
- 热度分析
一、结构
新闻爬虫包括三部分:URL采集器、详情页采集器、定时器;分析系统则由下表中的四个分析器组成:
分析器 | 功能 |
---|---|
关键词分析 | 将采集到的新闻内容(正文+标题)进行关键词分析 |
新闻相似度计算 | 利用关键词分析得到的新闻关键词,对每两篇新闻的关键词集合做交集(与)和并集(或)运算,以"交集大小 / 并集大小"作为新闻相似度 |
热词分析 | 通过关键词分析得出的关键词进行统计,得出出现频率较高的词汇作为热词 |
热度分析 | 根据新闻的阅读量、评论量和距今时间差计算新闻的热度(热度 = 阅读量×0.4 + 评论量×0.5 − 距今天数×0.1) |
二、具体实现
1.关键词分析
# -*- coding: utf-8 -*-
'''
Author: x
Desc:
代码11-2 不同语料库下的新闻关键词抽取-基于TFIDF
'''
import logging
from logging.handlers import TimedRotatingFileHandler
import pymysql
import jieba.analyse
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
# 1. Module-level logger at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. Create the handler and configure its formatter.
# NOTE(review): when="S" with interval=5 rolls the log file over on a 5-second
# interval and backupCount=20 caps the number of old files kept -- confirm that
# such a short retention window is intended outside of a demo.
log_file_handler = TimedRotatingFileHandler(filename="Recommend/analysis/kwg.log",
when="S", interval=5,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. Attach the handler to the logger object.
logger.addHandler(log_file_handler)
class SelectKeyWord:
    '''
    @Description: Extract TF-IDF keywords for every news row and store them.
        Constructing an instance does all the work: it connects to MySQL,
        loads every row of news_api_newsdetail, extracts up to 10 keywords
        per row with jieba and writes them back to the ``keywords`` column.
        The per-news results are kept in ``self.key_words`` as strings of the
        form "<news_id><TAB><kw1>,<kw2>,...".
    @:param _type --> 1: store the extracted keywords as they are;
                      2: additionally drop stop words and blank keywords.
                      Any other value only logs an error for each news row.
    '''

    # Part-of-speech whitelist forwarded to jieba's ``allowPOS`` argument.
    _ALLOW_POS = ('ns', 'n', 'vn', 'rn', 'nz')
    _STOP_WORDS_PATH = "Recommend/stopwords/stop_words.txt"
    # Parameterized so keywords containing quotes cannot break the statement.
    _UPDATE_SQL = "update news_api_newsdetail set keywords=%s where news_id=%s"

    def __init__(self, _type):
        self._type = _type
        self.db = self.connection()
        self.cursor = self.db.cursor()
        self.news_dict = self.loadData()
        self.key_words = self.getKeyWords()

    def connection(self):
        '''
        @Description: Open a MySQL connection using the Spider.settings values.
        @:return the pymysql connection object
        '''
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD, database=DB_NAME, port=DB_PORT,
                             charset='utf8')
        return db

    def loadData(self):
        '''
        @Description: Load the news rows into a dict keyed by integer news id.
        @:param None
        @:return {news_id: {"tag": row[0], "title": row[2], "content": row[6]}}
        '''
        news_dict = dict()
        for line in self.getDataFromDB():
            news_id = int(line[0])
            # A later row with the same id overwrites the earlier one.
            news_dict[news_id] = {
                "tag": line[0],
                "title": line[2],
                "content": line[6],
            }
        return news_dict

    def getDataFromDB(self):
        '''
        @Description: Fetch every row of news_api_newsdetail.
        @:param None
        @:return the fetched rows, or an empty tuple when the query fails
        '''
        logger.info("从数据库获取数据")
        sql_s = "select * from news_api_newsdetail"
        try:
            self.cursor.execute(sql_s)
            return self.cursor.fetchall()
        except Exception:
            # The original fell through to ``return message`` with ``message``
            # unbound; report the failure and hand back "no rows" instead.
            logger.exception("Error:news_api_newsdetail select Error!!")
            self.db.rollback()
            return ()

    def getKeyWords(self):
        '''
        @Description: Extract keywords (jieba TF-IDF) from title + content of
            every loaded news item and persist them to the database.
        @:param None (the mode is taken from self._type)
        @:return list of "<news_id><TAB><comma separated keywords>" strings
        '''
        news_key_words = list()
        # Loaded up-front for every mode, exactly as before.
        stop_words = self._loadStopWords()
        for new_id, news in self.news_dict.items():
            if self._type not in (1, 2):
                logger.error("请指定获取关键词的方法类型<1:TF-IDF 2:标题分词法>")
                continue
            keywords = jieba.analyse.extract_tags(
                # NULL columns arrive as None; treat them as empty text.
                (news["title"] or "") + (news["content"] or ""),
                topK=10,
                withWeight=False,
                allowPOS=self._ALLOW_POS
            )
            if self._type == 2:
                kws = list()
                for kw in keywords:
                    # Skip stop words and whitespace-only tokens.
                    if kw not in stop_words and kw.strip():
                        kws.append(kw)
                        logger.info("keyword:{}".format(kw))
                keywords = kws
            joined = ",".join(keywords)
            news_key_words.append(str(new_id) + '\t' + joined)
            # Mode 1 used to reference the undefined name ``kws`` here.
            self._saveKeywords(new_id, joined)
        return news_key_words

    def _loadStopWords(self):
        '''
        @Description: Read the stop-word list (one word per line) into a set.
        @:return set of stripped lines
        '''
        # NOTE(review): the file is read with the platform default encoding,
        # as before -- confirm the encoding of stop_words.txt.
        with open(self._STOP_WORDS_PATH, 'r') as stop_words_file:
            return {line.strip() for line in stop_words_file}

    def _saveKeywords(self, news_id, joined_keywords):
        '''
        @Description: Write one news item's keywords to the database; a failed
            update is logged and rolled back without aborting the run.
        '''
        try:
            self.cursor.execute(self._UPDATE_SQL, (joined_keywords, news_id))
            self.db.commit()
        except Exception:
            logger.error("Error:KeyWords update Error!!")
            self.db.rollback()

    def writeToFile(self):
        '''
        @Description: Write the keyword results to Recommend/data/keywords/1.txt,
            one news item per line.
        @:param None
        '''
        with open("Recommend/data/keywords/1.txt", "w", encoding="utf-8") as fw:
            fw.write("\n".join(self.key_words))
def splitTxt(source_dir='Recommend/data/keywords/1.txt',
             target_dir='Recommend/data/keywords/split/',
             lines_per_file=200):
    '''
    @Description: Split a file into consecutive chunks named
        pass_1.txt, pass_2.txt, ... inside an existing target directory.
        Lines are copied as raw bytes, so the encoding is left untouched.
        No file is created for an empty remainder (the previous version wrote
        an empty trailing file when the line count was an exact multiple of
        the chunk size).
    @:param source_dir --> path of the file to split
    @:param target_dir --> directory receiving the chunks (must exist)
    @:param lines_per_file --> maximum number of lines per chunk (>= 1)
    @:raise ValueError when lines_per_file is smaller than 1
    '''
    import os

    if lines_per_file < 1:
        raise ValueError("lines_per_file must be at least 1")

    def write_chunk(lines, index):
        chunk_path = os.path.join(target_dir, "pass_" + str(index) + ".txt")
        with open(chunk_path, 'wb') as f_target:
            f_target.writelines(lines)

    # Index used in the name of the next chunk file.
    name = 1
    # Lines collected for the chunk currently being filled.
    dataList = []
    with open(source_dir, 'rb') as f_source:
        for line in f_source:
            dataList.append(line)
            if len(dataList) == lines_per_file:
                write_chunk(dataList, name)
                name += 1
                dataList = []
    # Flush the last, shorter chunk if there is one.
    if dataList:
        write_chunk(dataList, name)
def beginSelectKeyWord(_type):
    '''
    @Description: Run the keyword analysis and dump its result to file.
    @:param _type --> extraction mode forwarded to SelectKeyWord
    '''
    # Building the object performs the extraction; only the dump is left.
    SelectKeyWord(_type=_type).writeToFile()
    logger.info("关键词获取完毕,数据写入路径 Recommend/data/keywords")
2.新闻相似度计算
'''
Author: Zeng
Desc:
代码11-3 每个类型下新闻的相似度计算
'''
import logging
import os
from logging.handlers import TimedRotatingFileHandler
import pymysql
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
# 1. Module-level logger at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. Create the handler and configure its formatter.
# NOTE(review): when="S" with interval=5 rolls the log file over on a 5-second
# interval and backupCount=20 caps the number of old files kept -- confirm that
# such a short retention window is intended outside of a demo.
log_file_handler = TimedRotatingFileHandler(filename="Recommend/analysis/ccg.log",
when="S", interval=5,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. Attach the handler to the logger object.
logger.addHandler(log_file_handler)
class Correlation:
    '''
    @Description: Compute pairwise news similarity from a keyword file.
        Constructing an instance loads the file and computes, for every
        ordered pair of different news ids, the ratio
        len(shared keywords) / len(all keywords of both) -- pairs with a
        ratio of 0 are dropped.  writeToMySQL() then stores the result.
    @:param file --> path of a keyword file whose lines look like
                     "<news_id><TAB><kw1>,<kw2>,..."
    '''

    def __init__(self, file):
        self.file = file
        self.news_tags = self.loadData()
        self.news_cor_list = self.getCorrelation()

    def connection(self):
        '''
        @Description: Open a MySQL connection using the Spider.settings values.
        @:return the pymysql connection object
        '''
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD, database=DB_NAME, port=DB_PORT,
                             charset='utf8')
        return db

    def loadData(self):
        '''
        @Description: Load the keyword analysis result file.
        @:param None
        @:return {news_id (str): comma separated keywords (str)}; lines that
                 do not consist of exactly two tab separated fields are
                 reported and skipped
        '''
        news_tags = dict()
        with open(self.file, "r", encoding="utf-8") as tag_file:
            for line in tag_file:
                try:
                    newid, newtags = line.strip().split("\t")
                except ValueError:
                    print("读取分词数据过程中出现错误,错误行为:{}".format(line))
                    logger.error("Error:{}".format(line))
                    continue
                news_tags[newid] = newtags
        return news_tags

    def getCorrelation(self):
        '''
        @Description: Compute the similarity of every ordered pair of news.
        @:param None
        @:return list of [news_id_1, news_id_2, "<ratio with 2 decimals>"];
                 each unordered pair appears in both directions
        '''
        # Split every tag string once instead of once per pair.  Blank entries
        # are dropped so that "" (which splits to ['']) is not a shared tag.
        tag_sets = {
            newid: {tag for tag in tags.split(",") if tag}
            for newid, tags in self.news_tags.items()
        }
        news_cor_list = list()
        for newid1, id1_tags in tag_sets.items():
            for newid2, id2_tags in tag_sets.items():
                if newid1 == newid2:
                    continue
                all_tags = id1_tags | id2_tags
                if not all_tags:
                    # Neither news item has keywords: nothing to compare.
                    continue
                cor = len(id1_tags & id2_tags) / len(all_tags)
                if cor > 0.0:
                    pair = [newid1, newid2, format(cor, ".2f")]
                    news_cor_list.append(pair)
                    logger.info("news_cor_list.append:{}".format(pair))
        return news_cor_list

    def writeToMySQL(self):
        '''
        @Description: Insert the similarity rows into news_api_newssimilar.
            Each row is committed on its own; a failing row is reported,
            rolled back and skipped.
        @:param None
        '''
        # Placeholders are filled in by the driver (no string-built SQL).
        sql_w = "insert into news_api_newssimilar( new_id_base,new_id_sim,new_correlation ) values(%s, %s ,%s)"
        db = self.connection()
        try:
            cur = db.cursor()
            for row in self.news_cor_list:
                try:
                    cur.execute(sql_w, (row[0], row[1], row[2]))
                    db.commit()
                except Exception:
                    print("rollback", row)
                    logger.error("rollback:{}".format(row))
                    db.rollback()
        finally:
            # This method opens its own connection, so it also closes it.
            db.close()
        print("相似度数据写入数据库:newsrec.newsim")
def beginCorrelation():
    '''
    @Description: Run the similarity analysis for every keyword file in
        Recommend/data/keywords/ and write each result to the database.
    @:param None
    '''
    original_data_path = "Recommend/data/keywords/"
    for file in os.listdir(original_data_path):
        file_path = os.path.join(original_data_path, file)
        # os.listdir also returns sub-directories (e.g. the "split" folder);
        # opening one as a keyword file would raise, so skip non-files.
        if not os.path.isfile(file_path):
            continue
        cor = Correlation(file_path)
        cor.writeToMySQL()
3. 热词分析
# -- coding: utf-8 --
'''
Author: Zeng
Desc:
3-19 使用分析出的KeyWord进行统计,获取热词
'''
import logging
import os
from logging.handlers import TimedRotatingFileHandler
import pymysql
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
# 1. Module-level logger at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. Create the handler and configure its formatter.
# NOTE(review): when="S" with interval=5 rolls the log file over on a 5-second
# interval and backupCount=20 caps the number of old files kept -- confirm that
# such a short retention window is intended outside of a demo.
log_file_handler = TimedRotatingFileHandler(filename="Recommend/analysis/hwg.log",
when="S", interval=5,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. Attach the handler to the logger object.
logger.addHandler(log_file_handler)
class HotWordLibrary():
    '''
    @Description: Count in how many news items each keyword occurs and store
        the keywords seen more than once as hot words.
        Constructing an instance does all the work, including the database
        write: it loads the keyword file, counts the keywords, empties
        news_api_hotword and inserts the new hot words.
    @:param file --> path of a keyword file whose lines look like
                     "<news_id><TAB><kw1>,<kw2>,..."
    '''

    def __init__(self, file):
        self.file = file
        self.db = self.connect()
        self.cursor = self.db.cursor()
        self.news_tags = self.loadFileData()
        self.result = self.StatisticalHotKey()
        # NOTE: the table is rewritten as part of construction.
        self.writeresult = self.writeToMySQL()

    def connect(self):
        '''
        @Description: Open a MySQL connection using the Spider.settings values.
        @:return the pymysql connection object
        '''
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD, database=DB_NAME, port=DB_PORT,
                             charset='utf8')
        return db

    def loadFileData(self):
        '''
        @Description: Load the keyword analysis result file.
        @:param None
        @:return {news_id (str): comma separated keywords (str)}; lines that
                 do not consist of exactly two tab separated fields are
                 logged and skipped
        '''
        logger.info("开始加载文件数据:{}".format(self.file))
        news_tags = dict()
        with open(self.file, "r", encoding="utf-8") as tag_file:
            for line in tag_file:
                try:
                    newid, newtags = line.strip().split("\t")
                except ValueError:
                    logger.info("读取分词数据过程中出现错误,错误行为:{}".format(line))
                    continue
                news_tags[newid] = newtags
        return news_tags

    def StatisticalHotKey(self):
        '''
        @Description: Count the hot words.
        @:param None
        @:return {keyword: number of news items containing it}; a keyword
                 repeated inside one news item counts once, blank keywords
                 are ignored
        '''
        from collections import Counter

        hot_word_list = Counter()
        for newstags in self.news_tags.values():
            # The set removes duplicates within a single news item.
            hot_word_list.update({keyword for keyword in newstags.split(",") if keyword})
        return dict(hot_word_list)

    def writeToMySQL(self):
        '''
        @Description: Replace the content of news_api_hotword with the
            keywords counted more than once.  Each insert is committed on its
            own; a failing insert is logged, rolled back and skipped.
        @:param None
        @:return 1
        '''
        logger.info("将数据写入数据库...")
        sql_t = "truncate table news_api_hotword"
        try:
            self.cursor.execute(sql_t)
            self.db.commit()
        except Exception:
            logger.error("truncate news_api_hotword failed")
            self.db.rollback()
        # Placeholders let the driver quote the word (no string-built SQL).
        sql_i = 'insert into news_api_hotword(hotword, num) values (%s, %s)'
        for word, num in self.result.items():
            if num > 1:
                try:
                    self.cursor.execute(sql_i, (word, num))
                    self.db.commit()
                except Exception:
                    logger.error("rollback:{}".format(word))
                    self.db.rollback()
        logger.info("推荐内容数据写入完成....")
        return 1
def beginHotWordLibrary():
    '''
    @Description: Run the hot-word statistics for every keyword file in
        Recommend/data/keywords/.
    @:param None
    '''
    original_data_path = "Recommend/data/keywords/"
    for file in os.listdir(original_data_path):
        file_path = os.path.join(original_data_path, file)
        # os.listdir also returns sub-directories (e.g. the "split" folder);
        # opening one as a keyword file would raise, so skip non-files.
        if not os.path.isfile(file_path):
            continue
        # Constructing HotWordLibrary already truncates and rewrites the
        # hot-word table, so writeToMySQL() is not called a second time.
        # NOTE(review): because every file truncates the table, only the
        # last processed file's hot words remain -- confirm this is intended.
        HotWordLibrary(file_path)
4. 热度分析
'''
Author:Z
Desc:新闻的热度值计算,并写入数据库
'''
import logging
from datetime import datetime
from logging.handlers import TimedRotatingFileHandler
import pymysql
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
# 1. Module-level logger at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. Create the handler and configure its formatter.
# NOTE(review): when="S" with interval=5 rolls the log file over on a 5-second
# interval and backupCount=20 caps the number of old files kept -- confirm that
# such a short retention window is intended outside of a demo.
log_file_handler = TimedRotatingFileHandler(filename="Recommend/analysis/hvg.log",
when="S", interval=5,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. Attach the handler to the logger object.
logger.addHandler(log_file_handler)
class CalHotValue:
    '''
    @Description: Compute a hotness score for every news row.
        Constructing an instance connects to MySQL and computes
        readnum * 0.4 + comments * 0.5 - (whole days since publication) * 0.1
        for each row of news_api_newsdetail; writeToMySQL() stores the result.
    '''

    def __init__(self):
        self.db = self.connect()
        self.cursor = self.db.cursor()
        self.result = self.calHotValue()

    def connect(self):
        '''
        @Description: Open a MySQL connection using the Spider.settings values.
        @:return the pymysql connection object
        '''
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD, database=DB_NAME, port=DB_PORT,
                             charset='utf8')
        return db

    @staticmethod
    def _parseDate(raw_date):
        '''
        @Description: Turn the stored publication date into a datetime.
        @:param raw_date --> a datetime, or text such as "2021年03月05日 10:30"
        @:return the parsed datetime
        '''
        if isinstance(raw_date, datetime):
            return raw_date
        normalized = raw_date.replace("年", "-").replace("月", "-").replace("日", " ")
        return datetime.strptime(str(normalized), '%Y-%m-%d %H:%M')

    def calHotValue(self):
        '''
        @Description: Compute the hotness values.
        @:param None
        @:return list of (news_id, category, "<score with 2 decimals>");
                 rows whose date or counters cannot be processed are logged
                 and left out
        '''
        base_time = datetime.now()
        sql = "select news_id, category, readnum , comments, date from news_api_newsdetail"
        self.cursor.execute(sql)
        result = list()
        for row in self.cursor.fetchall():
            try:
                diff = base_time - self._parseDate(row[4])
                hot_value = row[2] * 0.4 + row[3] * 0.5 - diff.days * 0.1
            except (AttributeError, TypeError, ValueError):
                # Missing/unparsable date or non-numeric counters.
                logger.error("转换出错")
                continue
            logger.info("HotValue:{}".format(hot_value))
            result.append((row[0], row[1], format(hot_value, ".2f")))
        logger.info("新闻热度值计算完毕,返回结果 ...")
        return result

    def writeToMySQL(self):
        '''
        @Description: Insert the hotness values into news_api_newshot.
            Each row is committed on its own; a failing row is reported,
            rolled back and skipped.
        @:param None
        '''
        # Placeholders let the driver quote textual categories correctly.
        sql_w = "insert into news_api_newshot( news_id,category,news_hot ) values(%s, %s ,%s)"
        for row in self.result:
            try:
                self.cursor.execute(sql_w, (row[0], row[1], row[2]))
                self.db.commit()
            except Exception:
                logger.error("rollback:{}".format(row))
                print("rollback", row)
                # Actually undo the failed statement, as the message says.
                self.db.rollback()
        logger.info("热度数据写入数据库:news.newshot")
def beginCalHotValue():
    '''
    @Description: Compute the hotness of all news and store it.
    @:param None
    '''
    logger.info("开始计算新闻的热度值 ...")
    # Building the object performs the calculation; only the write is left.
    CalHotValue().writeToMySQL()
总结
分析系统主要是运用关键词得出的数据进行计算和统计得出不同的结果,算法都相对简单,并没有很复杂和高大上的算法,理解起来相对简单,并且分析过程中并没有用到用户的操作数据,因此算法的考虑面相对狭隘了。
项目完整的源码已更新,有需要的可以自行下载😀
欢迎提交问题和错误
个人码云主页,欢迎交流!!
个人GitHub主页,欢迎交流!!