需求:
1.监控网站页面内的附件有无更新情况。
方案:
将数据库中保存的附件链接与页面内的附件链接进行比对;若页面中找不到相同的链接,则视为原附件已失效(即附件已更新)。
代码:
import urllib2
import time
import pymysql
import random
from bs4 import BeautifulSoup
import re
以上是所需引入的库,有些是 Python 自带的,有些(如 pymysql、bs4)需要另行安装。注意:文中写的 python 版本是 3.7,但 urllib2 只存在于 Python 2,在 Python 3 中应改用 urllib.request。
# Pool of desktop-browser User-Agent strings; the monitoring loop picks one
# at random for each page request.
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 BIDUBrowser/6.x Safari/537.31',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.44 Safari/537.36 OPR/24.0.1558.25 (Edition Next)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36 OPR/23.0.1522.60 (Edition Campaign 54)']

# Selects every row that still has to be monitored: rows whose updateTime is
# not today's date, rows that have no updateTime at all, and rows with flag = 0.
sql = "SELECT * FROM Singapore WHERE (updateTime <> CURDATE() OR updateTime IS NULL OR flag = 0)"

# The connection arguments below are redacted placeholders.
# NOTE(review): pymysql reads positional arguments as host, user, password,
# database -- confirm the real values are supplied in that order.
db = pymysql.connect('服务器ip', '数据库名', '*****', '*****', )
cursor = db.cursor()

# Load all pending rows up front; the loop below reads columns by position.
cursor.execute(sql)
results = cursor.fetchall()
# Matches a dd/mm/yyyy date. The separator is a literal "/" because that is
# the only layout the strptime() format used below can parse.
_DATE_RE = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')


def _link_still_present(page, attachment_url):
    """Rule 1: report whether the stored attachment path still occurs in the page.

    Args:
        page: Raw page body as returned by the HTTP response.
        attachment_url: Attachment link stored in the database row.

    Returns:
        True if any double-quote-delimited piece of ``page`` contains the
        path part of ``attachment_url``.

    Raises:
        IndexError: If ``attachment_url`` contains fewer than three "/".
    """
    # Everything after the third "/" is kept and re-rooted with "/".
    # Assumes a "scheme://host/path" shaped link, which yields "/path".
    path = '/' + attachment_url.split('/', 3)[3]
    return any(path in piece for piece in page.split('"'))


def _page_updated_date(page):
    """Rule 2: return the date found in <p class="updateddate"> as 'YYYY-MM-DD'.

    Args:
        page: Raw page body as returned by the HTTP response.

    Returns:
        The first dd/mm/yyyy date inside the tag, reformatted as YYYY-MM-DD.

    Raises:
        ValueError: If the tag is missing, holds no dd/mm/yyyy date, or the
            date is not a valid calendar date.
    """
    tag = BeautifulSoup(page, 'lxml').find('p', 'updateddate')
    found = _DATE_RE.findall(str(tag))
    if not found:
        raise ValueError('no dd/mm/yyyy date found in p.updateddate')
    parsed = time.strptime(found[0], '%d/%m/%Y')
    return time.strftime('%Y-%m-%d', parsed)


# Check every pending row. Outcome written back to the Singapore table:
#   flag=2  the stored link/date still matches the page,
#   flag=1  it no longer matches (old and new values are recorded),
#   flag=0  the check itself failed; the SELECT picks flag=0 rows up again.
try:
    for row in results:
        item_id = row[0]
        print(item_id)
        url = row[3]
        value = row[5]
        rule = row[9]
        try:
            # The header name must be "User-Agent"; with an underscore the
            # random browser string is sent as an unrelated header and
            # urllib2 keeps its default agent.
            headers = {'User-Agent': random.choice(user_agents)}
            req = urllib2.Request(url, headers=headers)
            # Cap the wait at 40 seconds so one slow site cannot stall the run.
            up = urllib2.urlopen(req, timeout=40)
            try:
                cont = up.read()
            finally:
                up.close()

            dt_new = None
            if rule == 1:
                # Monitoring rule 1: look for the stored attachment link.
                unchanged = _link_still_present(cont, row[4])
            elif rule == 2:
                # Monitoring rule 2: compare the page's "updated" date.
                dt_new = _page_updated_date(cont)
                # str() so the comparison also works when the driver hands
                # back a date object instead of text.
                # NOTE(review): confirm the type of column 5.
                unchanged = dt_new == str(value)
            else:
                # Unknown rule: nothing is compared, row is treated as changed.
                unchanged = False

            # Values are passed as query parameters rather than concatenated
            # into the SQL text.
            if unchanged:
                cursor.execute(
                    'update Singapore set updateTime=CURDATE(),flag=2 '
                    'where itemid=%s',
                    (item_id,))
            elif rule == 2:
                cursor.execute(
                    'update Singapore set newDate=%s,oldDate=%s,'
                    'updateTime=CURDATE(),flag=1 where itemid=%s',
                    (dt_new, str(value), item_id))
            else:
                cursor.execute(
                    'update Singapore set newDate=CURDATE(),oldDate=%s,'
                    'updateTime=CURDATE(),flag=1 where itemid=%s',
                    (str(value), item_id))
            db.commit()
        except Exception as e:
            # Best effort: log the time and the error, mark the row for a
            # retry, and move on to the next row.
            print(time.strftime('%Y-%m-%d %H:%M:%S'))
            print(e)
            cursor.execute(
                'update Singapore set flag=0 where itemid=%s', (item_id,))
            db.commit()
finally:
    # Release the connection even if the loop aborts unexpectedly.
    db.close()