简介
爬取学校新闻网站文章,包括标题,作者,日期,阅读量和文章内容,并储存在MySQL
思路
我用到的是requests库,先获取网页源码,并用pyquery提取所需信息,然后用pymysql存储到数据库中。
爬取问题与解决
在提取所需信息时,包括标题,作者,时间,文章都是可以直接在网页源码中截取的,但是在提取阅读量时,却发现是通过post方法获取的。
因为我找到了返回阅读量的post请求,所以打算直接模拟post获取,但是在寻找接口的时候,却找到了下面这段代码
$.ajax(
{type:'post',
url:'/interFace/getDocReadCount.do?id=506980',
timeout:2000,
success:function(ret)
{$('#readcount').html($.trim(ret))},
error:function(ret)
{$.ajax({
type:'post',
url:'/ecms_external?method=syncOfflineVisitCount',
data:'type=doc&requestUrl='+location.href,
timeout:2000,
success:function(ret){
$('#readcount').html(ret);},
error:function(){$('#readcount').html(0);
}});}});
我直接好家伙,因为我可以通过访问第三行的地址获取到阅读量。
url:'/interFace/getDocReadCount.do?id=506980',
根据过去经验,一般可以在网页header里找到id,但是纵览整个页面,包括请求头内,只有这一行有该文章的id,所以只能直接通过提取字符串来获取。
存储问题与解决
# 保存文章信息
def __save_art(self, head, content):
key = 'title' + ', author' + ', readcount' + ', time' + ', content' # 键名称
# 值内容
value = ' \'' + head["title"] + '\', \'' + head["author"] \
+ '\', \'' + head["readCount"] + '\', \'' \
+ head["time"] + '\', \'' + content + '\''
sqlInsert = 'INSERT INTO %s(%s) values(%s)' % (self.__table_name, key, value) # 提交内容
self.__cursor.execute(sqlInsert)
大部分文章都可以正常存储在MySQL里,但个别文章中出现了单引号之类的符号,导致SQL的Insert语句出现语法错误,程序直接终止,因此需要转义字符串内容。我使用的是
pymysql.escape_string(text)
修改后不再报错
def __save_art(self, head, content):
key = 'title' + ', author' + ', readcount' + ', time' + ', content' # 键名称
# 值内容
value = ' \'' + pymysql.escape_string(head["title"]) + '\', \'' + pymysql.escape_string(head["author"]) \
+ '\', \'' + head["readCount"] + '\', \'' \
+ head["time"] + '\', \'' + pymysql.escape_string(content) + '\''
sqlInsert = 'INSERT INTO %s(%s) values(%s)' % (self.__table_name, key, value) # 提交内容
self.__cursor.execute(sqlInsert)
代码
下面是代码,爬取范围是根据时间确定的:
import requests
import pymysql
import time
from pyquery import PyQuery as pq
class CatchFDYW(object):
    """Crawl articles from the FZU campus-news list pages into MySQL.

    For every article newer than the configured cut-off date this saves the
    title, author, publication date, read count and body text into table
    ``yw`` of database ``fdyw2`` on a local MySQL server.
    """
    __maxPage = 50  # hard upper bound on how many list pages to scan

    def __init__(self):
        # Crawl state lives on the instance (the original used shared class
        # attributes, so two crawlers would have mixed their URL lists).
        self.__url_list = []    # relative URLs of the articles to fetch
        self.__db = None        # pymysql connection, opened lazily by __setDB
        self.__cursor = None    # cursor of that connection
        self.__is_open_db = 0   # 1 once a connection has been opened
        self.__base_headers = {  # browser UA so the site serves normal pages
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                          '537.36 (KHTML, like Gecko) Chrome/79.0.3945.36 Safari/537.36',
        }
        self.__list_url = "http://news.fzu.edu.cn/html/fdyw/"  # list-page base URL
        self.__art_url = "http://news.fzu.edu.cn/"             # article base URL
        self.__RC_url = "http://news.fzu.edu.cn/interFace/getDocReadCount.do"  # read-count API
        self.__year = 2019      # cut-off date: stop at articles dated on/after it
        self.__month = 12
        self.__day = 31
        self.__table_name = "yw"  # destination table name

    # Set the crawl cut-off date.
    def eTimeSet(self, y=2020, m=12, d=31):
        """Set the cut-off date; crawling stops at articles dated on/after it.

        The year is always overwritten; month/day are only updated when
        positive, so passing 0 (or a negative) keeps the previous value.
        """
        self.__year = y
        if m > 0:
            self.__month = m
        if d > 0:
            self.__day = d

    # Fetch a page via GET.
    def __getHtml(self, url):
        """GET *url* and return the page source, or None on error / non-200."""
        try:
            response = requests.get(url=url, headers=self.__base_headers)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError as e:
            print("Error", e.args)
        return None

    # Parse an article date string.
    def __getTime(self, t):
        """Parse a 'YYYY-MM-DD' string into a {year, month, day} int dict."""
        # Local renamed from `time` so it no longer shadows the time module.
        year, month, day = (int(part) for part in t.split("-")[:3])
        return {"year": year, "month": month, "day": day}

    # Compare an article date with the cut-off date.
    def __cmpTime(self, t):
        """Return 0 if *t* is on or after the cut-off date, else 1."""
        if t["year"] > self.__year:
            return 0
        if t["year"] == self.__year:
            if t["month"] > self.__month:
                return 0
            if t["month"] == self.__month and t["day"] >= self.__day:
                return 0
        return 1

    # Decide whether crawling should stop at this list item.
    def __isEnd(self, data):
        """Return 1 when this list item means crawling should stop, else 0."""
        if not data:
            return 1
        times = data(".list_time").text()
        return 1 if self.__cmpTime(self.__getTime(times)) else 0

    # Collect article URLs from the paginated list.
    def __getList(self):
        """Walk list pages, appending article hrefs until the cut-off date."""
        page = 1
        while page < self.__maxPage:
            url = self.__list_url + str(page) + ".html"
            response = self.__getHtml(url)
            for item in pq(response)(".list_main_content ul")("li").items():
                if self.__isEnd(item):
                    return
                self.__url_list.append(item("a").attr("href"))
            page += 1

    # Fetch the read count via the POST-only API.
    def __getRC(self, id):
        """POST the article id and return the read count as text, '0' on failure."""
        try:
            response = requests.post(url=self.__RC_url, params={"id": id},
                                     headers=self.__base_headers)
            if response.status_code == 200:
                # The API pads the number with whitespace (the site JS $.trim's it).
                return response.text.strip()
        except requests.ConnectionError as e:
            print("Error", e.args)
        return "0"  # same fallback the site's own JS uses on error

    # Extract header info: title, date, author, read count.
    def __getHead(self, data):
        """Extract title, date, author and read count from the article header."""
        title = data("p").text()
        pub_time = data("#fbsj").text()
        author = data("#author").text()
        # The article id only appears inside an inline <script> ("...do?id=NNNNNN"),
        # so it has to be sliced out of the script text.
        url_id = data("div script").text()
        start = url_id.find("do?id=") + 6
        url_id = url_id[start:start + 6]
        return {
            "title": title,
            "time": pub_time,
            "author": author,
            "readCount": self.__getRC(url_id),
        }

    # Extract the article body text.
    def __getContent(self, data):
        """Concatenate the text of all <p> tags into the article body."""
        return "".join(item.text() for item in data("p").items())

    # Main crawl entry point.
    def getArticle(self):
        """Crawl every article up to the cut-off date and store it in MySQL."""
        start = time.perf_counter()           # timing
        self.__setDB()                        # ensure connection / db / table
        self.__url_list.clear()               # drop URLs from a previous run
        self.__getList()                      # gather article URLs
        cnt = 0
        for art_url in self.__url_list:
            cnt += 1
            response = self.__getHtml(self.__art_url + art_url)
            print("Getting article %d......" % cnt)
            if response is None:
                # Fetch failed; skip instead of crashing inside pyquery.
                continue
            doc = pq(response)                # parse once, reuse for both queries
            self.__save_art(self.__getHead(doc(".detail_main_content")),
                            self.__getContent(doc("#news_content_display")))
        self.__db.commit()
        self.__db.close()
        self.__is_open_db = 0                 # connection is gone; reconnect next run
        print("Total:", cnt)
        end = time.perf_counter()
        print("Using time:", "%5.1f" % (end - start), "s")

    # Database setup entry point.
    def __setDB(self):
        """Ensure an open connection and that the database and table exist."""
        if self.__is_open_db:
            try:
                # pymysql's ping() returns None, so the original `not ping()`
                # test reconnected (and leaked a connection) on every call;
                # ping(reconnect=True) revives a dropped connection instead.
                self.__db.ping(reconnect=True)
            except pymysql.MySQLError:
                self.__is_open_db = 0
        if not self.__is_open_db:
            self.__set_database()
        self.__create_db()
        self.__create_table()
        self.__db.commit()

    # Open the MySQL connection.
    def __set_database(self):
        """Open the MySQL connection and cursor.

        Failures are reported but swallowed (matching the original
        best-effort behaviour); later DB calls will then fail loudly.
        """
        try:
            self.__db = pymysql.connect(host='localhost', user='root',
                                        password='00929.', port=3306)
            self.__cursor = self.__db.cursor()
            self.__is_open_db = 1
        except pymysql.MySQLError as e:   # was a bare except that hid the cause
            print("数据库打开失败", e)

    # Create and select the database.
    def __create_db(self):
        """Create (if needed) and select the target database; return its name."""
        db_name = "fdyw2"
        self.__cursor.execute('CREATE DATABASE IF NOT EXISTS %s' % db_name)
        self.__cursor.execute('USE %s' % db_name)
        return db_name

    # Create the destination table.
    def __create_table(self):
        """Create the destination table (id auto-increment PK) if missing."""
        columns = (  # (name, type, attributes)
            ("id", "int(20)", "NOT NULL AUTO_INCREMENT"),
            ("title", "varchar(255)", "NOT NULL"),
            ("author", "varchar(255)", "NOT NULL"),
            ("readcount", "int(20)", "NOT NULL"),
            ("time", "date", "NOT NULL"),
            ("content", "text", "NOT NULL"),
        )
        col_defs = ', '.join('%s %s %s' % col for col in columns)
        self.__cursor.execute('CREATE TABLE IF NOT EXISTS %s (%s, PRIMARY KEY (id))'
                              % (self.__table_name, col_defs))
        self.__cursor.execute('ALTER TABLE %s AUTO_INCREMENT = 1' % self.__table_name)

    # Save one article row.
    def __save_art(self, head, content):
        """Insert one article row.

        Uses a parameterized query, so quotes and other special characters in
        the scraped text cannot break the statement — the original escaped
        only title/author/content (readCount and time were concatenated raw)
        and relied on pymysql.escape_string, removed in pymysql 1.x.
        """
        sql = ('INSERT INTO %s (title, author, readcount, time, content) '
               'VALUES (%%s, %%s, %%s, %%s, %%s)') % self.__table_name
        self.__cursor.execute(sql, (head["title"], head["author"],
                                    head["readCount"], head["time"], content))
# Driver: run two crawls with successive cut-off dates on one crawler instance.
crawler = CatchFDYW()
for end_date in ((2020, 11, 20), (2020, 11, 26)):
    crawler.eTimeSet(*end_date)  # set the cut-off date for this pass
    crawler.getArticle()         # crawl and store everything up to it