背景:利用爬虫,爬取网站页面广告元素,监控爬取元素的数目,定时发送监控邮件
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
@xiayun
@896365105@qq.com
#爬取网站内容,利用phantomjs:IP代理+修改UA+动态页面执行JS
'''
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import urllib,urllib2
import smtplib
import re
from email.mime.text import MIMEText
from email.header import Header
import sys
def reptile():
    """Load the monitored page through each proxy and count the ad elements.

    Proxy addresses are read from ``./proxy_ip.txt``, one per line.  For each
    proxy a PhantomJS browser with a mobile user agent opens the page, and the
    page source is searched for the ad-element pattern.  The loop stops at the
    first proxy for which exactly 22 matches are found.

    The proxy pool is not validated here; the original author notes that
    proxies (scraped or bought) are unstable and should be checked by a
    separate program first.

    Returns:
        A ``(result, data)`` tuple: ``result`` is a status message and
        ``data`` is the page source from the last attempt (an empty string if
        no proxy was available).
    """
    # Kept for backward compatibility: the function has always published its
    # outcome through these module-level names as well as its return value.
    global result, data
    # Defined up front so an empty proxy list cannot leave them unset.
    result = "No proxy IP available, webpage not checked"
    data = ''

    # Read every line of the pool (not just the first) and close the file.
    with open("./proxy_ip.txt", 'r') as proxy_file:
        IPS = [line.strip() for line in proxy_file if line.strip()]
    print(IPS)

    # NOTE(review): under Python 2 page_source is presumably unicode while
    # this pattern is a byte string; confirm a non-ASCII pattern matches.
    rule1 = re.compile(r"广告元素")

    for IP_str in IPS:
        print(IP_str)
        service_args = ['--proxy-type=HTTP', '--proxy=%s' % IP_str]

        # Build the user-agent header.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            'Mozilla/5.0 (baomihua@iPhone; '
            'CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 '
            '(KHTML, like Gecko) '
            'Version/9.0 Mobile/13B143 Safari/601.1')

        # Drive PhantomJS like a browser; service_args carries the proxy.
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        try:
            # Access timeouts.
            driver.implicitly_wait(60)
            driver.set_page_load_timeout(60)
            try:
                driver.get('网页地址')
            except Exception:
                # Best effort: whatever was loaded is still inspected below.
                print("timeout")
            # Give page scripts time to run before taking the snapshot.
            time.sleep(20)
            data = driver.page_source
        finally:
            # Always shut the browser down, even when an error propagates.
            driver.quit()

        counts = len(rule1.findall(data))
        print(counts)

        # The page is considered healthy when exactly 22 ad elements exist.
        if counts == 22:
            print("The webpage is OK!")
            result = "The webpage is OK!Find 22 广告元素! proxy_IP:%s " % IP_str
            break

        print("%s is bad!" % IP_str)
        result = "The webpage maybe bad"

    print("close")
    # Return the status message and the page source.
    return result, data
def send_mail(result,data):
    """Send the monitoring result by e-mail.

    Args:
        result: Status message; used as the mail subject and as the display
            name of the sender.
        data: Mail body (the page source), sent as UTF-8 plain text.

    Returns:
        True if the message was handed to the SMTP server, False if
        connecting, logging in or sending failed.
    """
    # Local import: only this function needs it.
    from email.utils import formataddr

    receivers = ['XXX@XX.com']  # recipients
    mail_host = 'smtp.exmail.qq.com'  # SMTP server of the mail provider
    mail_user = 'xxx@xxx.com'  # sender address
    mail_pass = 'xxxx'  # password

    title = str(result)
    msg = MIMEText(data, 'plain', 'utf-8')  # plain-text body
    msg['Subject'] = Header(title, 'utf-8')
    # Encode only the display name; the address itself must stay literal.
    msg['From'] = formataddr((Header(title, 'utf-8').encode(), mail_user))
    # Address lists in headers are comma-separated.
    msg['To'] = ",".join(receivers)

    s = smtplib.SMTP()
    try:
        s.connect(mail_host)
        try:
            s.login(mail_user, mail_pass)
            # The envelope sender is the bare address, not the display form.
            s.sendmail(mail_user, receivers, msg.as_string())
        finally:
            # Release the connection whether or not sending succeeded.
            s.close()
    except (smtplib.SMTPException, EnvironmentError):
        # EnvironmentError also covers socket-level failures, so a network
        # problem does not crash the monitoring loop.
        print("Error: 无法发送邮件")
        return False
    print("发送成功")
    return True
if __name__ == '__main__':
    # Monitoring loop: check the page, mail the outcome, then wait ten
    # minutes before the next round.
    while True:
        print('%s %s' % ('start', time.ctime(time.time())))
        result, data = reptile()
        send_mail(result, data)
        print('%s %s' % ('stop', time.ctime(time.time())))
        time.sleep(600)
    # Only reached if the loop above is ever given an exit condition.
    sys.exit(0)
转载于:https://blog.51cto.com/linuxerxy/1893893