Python 3.6: implementing a simple web crawler and image downloader with the urllib.request library

# Changelog
# 0418 Crawl product URLs from listing pages
# 0421 Update: added crawling and downloading of page images
# 0423 Update: added email sending
#      Improvement: exception handling in the crawler; handling of error pages and empty pages
#      Improvement: keyword blacklist/whitelist for the crawler, improving efficiency

 

################################################################# 
#author: 陈月白
#_blogs: http://www.cnblogs.com/chenyuebai/
#################################################################
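
The 0423 update's main efficiency gain is the keyword blacklist/whitelist check in get_page (full listing below): error and expired pages are detected by keywords and treated as empty, so no regex matching or downloading is attempted on them. Here is a minimal sketch of that idea in isolation; the keywords used are hypothetical examples, not the ones from the sites crawled below.

# Minimal sketch of the keyword filter used in get_page below.
# NOTE: the keywords here are hypothetical examples for illustration only.
def filter_page(page_data, black_keyword_list=(), must_keyword_list=()):
    # A blacklisted keyword marks an error/expired page: treat it as empty
    for black_keyword in black_keyword_list:
        if black_keyword in page_data:
            return ""
    # A missing required keyword means the target content is absent: also empty
    for must_keyword in must_keyword_list:
        if must_keyword not in page_data:
            return ""
    return page_data

# Example: drop "page not found" pages, keep only pages that mention attachments
print(filter_page("<html>attachments...</html>", ["page not found"], ["attachments"]))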

# -*- coding: utf-8 -*-
import urllib.request
import sys
import traceback
import re
import socket
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr

socket.setdefaulttimeout(15.0)

class CRAWLER():
    # Initialize instance state
    def __init__(self):
        self.pageIndex = 0
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    # Fetch the HTML of the page at url
    def get_page(self, url, black_keyword_list=[], must_keyword_list=[]):
        try:
            page_data = ""
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request, timeout=10)
            page_data = response.read().decode()
            #print("####################\n", page_data)
        except Exception:
            print("get_page %s caught an error, now return" % url)
            #traceback.print_exc()
            return page_data

        # Blacklist keywords flag invalid pages: if one appears in page_data,
        # treat page_data as empty
        if black_keyword_list:
            for black_keyword in black_keyword_list:
                if black_keyword in page_data:
                    print("black_keyword =", black_keyword)
                    page_data = ""
                    return page_data

        # Required keywords: if one is missing from page_data, treat page_data as empty
        if must_keyword_list:
            for must_keyword in must_keyword_list:
                if must_keyword not in page_data:
                    print("must_keyword: [%s] is not in page_data, now let page_data be empty!" % must_keyword)
                    page_data = ""
                    return page_data

        if page_data == '':
            print("EXEC:get_page(%s) failed, page_data is empty..." % url)
        else:
            print("EXEC:get_page(%s) success!" % url)
        return page_data

    # Entry point: fetch url and extract the items captured by the regex in flag
    def select_items_from_url(self, url, flag, black_keyword_list=[], must_keyword_list=[]):
        print("url =", url)
        print("flag =", flag)
        page_data = self.get_page(url, black_keyword_list, must_keyword_list)
        if page_data == "":
            print("page_data =", page_data)
            return page_data

        pattern = re.compile(flag, re.S)
        print("pattern =", pattern)
        items = re.findall(pattern, page_data)
        if not items:
            print("EXEC:select_items_from_url failed, selected items are empty")
        else:
            print("EXEC:select_items_from_url success!")
        return items

    # Download a single image from image_url to image_load_fullpath
    def load_image_by_imageUrl(self, image_url, image_load_fullpath):
        print("image_url =", image_url)
        print("image_load_fullpath =", image_load_fullpath)
        try:
            # Install an opener with a browser User-Agent so the site does not
            # reject the default one used by urlretrieve
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
            urllib.request.install_opener(opener)

            print("now start to load %s" % image_url)
            urllib.request.urlretrieve(image_url, image_load_fullpath)
            print("FUNC:load_image_by_imageUrl %s success!" % image_url)
        except Exception:
            print("CATCH AN ERROR FUNC:load_image_by_imageUrl %s failed..." % image_url)
            return

    def start(self):
        pass

    # Send a plain-text mail to every address in receive_user
    def send_email(self, receive_user, topic_name, body_text):
        send_user = "85********2@qq.com"
        send_passwd = "********"

        try:
            msg = MIMEText(body_text, 'plain', 'utf-8')        # mail body
            msg['From'] = formataddr(["********", send_user])  # sender nickname, sender account
            msg['Subject'] = topic_name                        # mail subject

            server = smtplib.SMTP("smtp.qq.com", 25)
            server.set_debuglevel(1)
            server.connect("smtp.qq.com")
            server.ehlo()
            server.starttls()        # upgrade the connection to TLS before logging in
            server.ehlo()
            server.login(send_user, send_passwd)
            server.sendmail(send_user, receive_user, msg.as_string())
            server.quit()
            print("send mail to %s success!" % receive_user)
        except Exception:
            print("send mail to %s failed!" % receive_user)


#main
def get_goods_image_from_suning():
    suning = CRAWLER()
    flag = '<img class="search-loading" width="220" height="220" src2="(.*?)"></a>'
    page_num = 5
    name_index = 0
    try:
        for i in range(page_num):
            items = suning.select_items_from_url('https://list.suning.com/0-20006-%s.html' % i, flag)
            if not items:
                continue
            for item in items:
                load_image_fullpath = "E:\\workSpace\\TMP\\%s.jpg" % name_index
                suning.load_image_by_imageUrl("http:%s" % item, load_image_fullpath)
                name_index = name_index + 1
                print("FUNC:get_goods_image_from_suning success! image load path is: %s" % load_image_fullpath)
    except Exception:
        print("CATCH AN ERROR FUNC:get_goods_image_from_suning")


#main
def get_adu_image_from_9():
    socket.setdefaulttimeout(5.0)
    adu = CRAWLER()

    flag = '<img src=".*?" file="(.*?)".*?onmouseover=.*?alt=.*?/>'
    page_index = 171186  #160202 #150007
    # Blacklist: "You are not authorized to perform this operation" / "Sorry, page not found"
    black_keyword_list = ["您无权进行当前操作,原因如下", "对不起,找不到页面!<br>"]
    must_keyword_list = ["attachments"]

    image_download_num = 0
    while True:
        try:
            print("------------------------------------------------------------------------")
            items = adu.select_items_from_url('http://********tid=%s' % page_index, flag, black_keyword_list, must_keyword_list)
            page_index = page_index + 1
            if not items:
                print("")
                continue
            for item in items:
                image_name = item.split("/")[1]
                print("image_name =", image_name)
                load_image_fullpath = "E:\\workSpace\\TMP\\ad_image\\%s_%s" % (page_index - 1, image_name)
                adu.load_image_by_imageUrl("http://********/%s" % item, load_image_fullpath)
                image_download_num = image_download_num + 1
                print("FUNC:get_adu_image_from_9 success! image load path is: %s" % load_image_fullpath)
                print("image_download_num now is %s \n" % image_download_num)
        except Exception:
            print("CATCH AN ERROR FUNC:get_adu_image_from_9\n")


def love_letter():
    love_ch = CRAWLER()
    love_ch.send_email(["50********9@qq.com", "46********@qq.com"], "LOVE MAIL 05", "我爱你!")  # body: "I love you!"


############################################################################
def main():
    #get_adu_image_from_9()
    love_letter()

main()
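
For reference, a minimal usage sketch of the CRAWLER class above, assuming the listing has been pasted in or imported; the URL and capture regex are hypothetical placeholders, not the sites from this post:

# Hedged usage sketch: the URL and regex below are placeholders for illustration.
crawler = CRAWLER()
flag = '<img class="item" src="(.*?)"'  # hypothetical capture pattern for image URLs
items = crawler.select_items_from_url('https://example.com/list-0.html', flag)
if items:
    for index, item in enumerate(items):
        crawler.load_image_by_imageUrl(item, "E:\\workSpace\\TMP\\demo_%s.jpg" % index)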


 

# Execution results

1. Image crawling (ran for roughly an hour; throughput was acceptable):

(screenshot omitted)

2. Sending mail:

(screenshot omitted)

Reposted from: https://www.cnblogs.com/chenyuebai/p/6728532.html
