Python 3.6: implementing a simple web crawler and image downloader with the urllib.request library

# Changelog
# 0418 Crawl product URLs from listing pages
# 0421 Update: added crawling and downloading of page images
# 0423 Update: added email sending
#      Improvement: exception handling in the crawler; handling of error pages and empty pages
#      Improvement: keyword blacklist/whitelist for the crawler, improving efficiency

 

################################################################# 
#author: 陈月白
#_blogs: http://www.cnblogs.com/chenyuebai/
#################################################################
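
The 0423 update's main efficiency gain is the keyword blacklist/whitelist check in get_page (full listing below): error and expired pages are detected by keywords and treated as empty, so no regex matching or downloading is attempted on them. Here is a minimal sketch of that idea in isolation; the keywords used are hypothetical examples, not the ones from the sites crawled below.

# Minimal sketch of the keyword filter used in get_page below.
# NOTE: the keywords here are hypothetical examples for illustration only.
def filter_page(page_data, black_keyword_list=(), must_keyword_list=()):
    # A blacklisted keyword marks an error/expired page: treat it as empty
    for black_keyword in black_keyword_list:
        if black_keyword in page_data:
            return ""
    # A missing required keyword means the target content is absent: also empty
    for must_keyword in must_keyword_list:
        if must_keyword not in page_data:
            return ""
    return page_data

# Example: drop "page not found" pages, keep only pages that mention attachments
print(filter_page("<html>attachments...</html>", ["page not found"], ["attachments"]))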

# -*- coding: utf-8 -*-
import urllib.request
import sys
import traceback
import re
import socket
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr

socket.setdefaulttimeout(15.0)

class CRAWLER():
    # Initialize instance state
    def __init__(self):
        self.pageIndex = 0
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    # Fetch the HTML of the page at url
    def get_page(self, url, black_keyword_list=[], must_keyword_list=[]):
        try:
            page_data = ""
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request, timeout=10)
            page_data = response.read().decode()
            #print("####################\n", page_data)
        except Exception:
            print("get_page %s caught an error, now return" % url)
            #traceback.print_exc()
            return page_data

        # Blacklist keywords flag invalid pages: if one appears in page_data,
        # treat page_data as empty
        if black_keyword_list:
            for black_keyword in black_keyword_list:
                if black_keyword in page_data:
                    print("black_keyword =", black_keyword)
                    page_data = ""
                    return page_data

        # Required keywords: if one is missing from page_data, treat page_data as empty
        if must_keyword_list:
            for must_keyword in must_keyword_list:
                if must_keyword not in page_data:
                    print("must_keyword: [%s] is not in page_data, now let page_data be empty!" % must_keyword)
                    page_data = ""
                    return page_data

        if page_data == '':
            print("EXEC:get_page(%s) failed, page_data is empty..." % url)
        else:
            print("EXEC:get_page(%s) success!" % url)
        return page_data

    # Entry point: fetch url and extract the items captured by the regex in flag
    def select_items_from_url(self, url, flag, black_keyword_list=[], must_keyword_list=[]):
        print("url =", url)
        print("flag =", flag)
        page_data = self.get_page(url, black_keyword_list, must_keyword_list)
        if page_data == "":
            print("page_data =", page_data)
            return page_data

        pattern = re.compile(flag, re.S)
        print("pattern =", pattern)
        items = re.findall(pattern, page_data)
        if not items:
            print("EXEC:select_items_from_url failed, selected items are empty")
        else:
            print("EXEC:select_items_from_url success!")
        return items

    # Download a single image from image_url to image_load_fullpath
    def load_image_by_imageUrl(self, image_url, image_load_fullpath):
        print("image_url =", image_url)
        print("image_load_fullpath =", image_load_fullpath)
        try:
            # Install an opener with a browser User-Agent so the site does not
            # reject the default one used by urlretrieve
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
            urllib.request.install_opener(opener)

            print("now start to load %s" % image_url)
            urllib.request.urlretrieve(image_url, image_load_fullpath)
            print("FUNC:load_image_by_imageUrl %s success!" % image_url)
        except Exception:
            print("CATCH AN ERROR FUNC:load_image_by_imageUrl %s failed..." % image_url)
            return

    def start(self):
        pass

    # Send a plain-text mail to every address in receive_user
    def send_email(self, receive_user, topic_name, body_text):
        send_user = "85********2@qq.com"
        send_passwd = "********"

        try:
            msg = MIMEText(body_text, 'plain', 'utf-8')        # mail body
            msg['From'] = formataddr(["********", send_user])  # sender nickname, sender account
            msg['Subject'] = topic_name                        # mail subject

            server = smtplib.SMTP("smtp.qq.com", 25)
            server.set_debuglevel(1)
            server.connect("smtp.qq.com")
            server.ehlo()
            server.starttls()        # upgrade the connection to TLS before logging in
            server.ehlo()
            server.login(send_user, send_passwd)
            server.sendmail(send_user, receive_user, msg.as_string())
            server.quit()
            print("send mail to %s success!" % receive_user)
        except Exception:
            print("send mail to %s failed!" % receive_user)


#main
def get_goods_image_from_suning():
    suning = CRAWLER()
    flag = '<img class="search-loading" width="220" height="220" src2="(.*?)"></a>'
    page_num = 5
    name_index = 0
    try:
        for i in range(page_num):
            items = suning.select_items_from_url('https://list.suning.com/0-20006-%s.html' % i, flag)
            if not items:
                continue
            for item in items:
                load_image_fullpath = "E:\\workSpace\\TMP\\%s.jpg" % name_index
                suning.load_image_by_imageUrl("http:%s" % item, load_image_fullpath)
                name_index = name_index + 1
                print("FUNC:get_goods_image_from_suning success! image load path is: %s" % load_image_fullpath)
    except Exception:
        print("CATCH AN ERROR FUNC:get_goods_image_from_suning")


#main
def get_adu_image_from_9():
    socket.setdefaulttimeout(5.0)
    adu = CRAWLER()

    flag = '<img src=".*?" file="(.*?)".*?onmouseover=.*?alt=.*?/>'
    page_index = 171186  #160202 #150007
    # Blacklist: "You are not authorized to perform this operation" / "Sorry, page not found"
    black_keyword_list = ["您无权进行当前操作,原因如下", "对不起,找不到页面!<br>"]
    must_keyword_list = ["attachments"]

    image_download_num = 0
    while True:
        try:
            print("------------------------------------------------------------------------")
            items = adu.select_items_from_url('http://********tid=%s' % page_index, flag, black_keyword_list, must_keyword_list)
            page_index = page_index + 1
            if not items:
                print("")
                continue
            for item in items:
                image_name = item.split("/")[1]
                print("image_name =", image_name)
                load_image_fullpath = "E:\\workSpace\\TMP\\ad_image\\%s_%s" % (page_index - 1, image_name)
                adu.load_image_by_imageUrl("http://********/%s" % item, load_image_fullpath)
                image_download_num = image_download_num + 1
                print("FUNC:get_adu_image_from_9 success! image load path is: %s" % load_image_fullpath)
                print("image_download_num now is %s \n" % image_download_num)
        except Exception:
            print("CATCH AN ERROR FUNC:get_adu_image_from_9\n")


def love_letter():
    love_ch = CRAWLER()
    love_ch.send_email(["50********9@qq.com", "46********@qq.com"], "LOVE MAIL 05", "我爱你!")  # body: "I love you!"


############################################################################
def main():
    #get_adu_image_from_9()
    love_letter()

main()
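
For reference, a minimal usage sketch of the CRAWLER class above, assuming the listing has been pasted in or imported; the URL and capture regex are hypothetical placeholders, not the sites from this post:

# Hedged usage sketch: the URL and regex below are placeholders for illustration.
crawler = CRAWLER()
flag = '<img class="item" src="(.*?)"'  # hypothetical capture pattern for image URLs
items = crawler.select_items_from_url('https://example.com/list-0.html', flag)
if items:
    for index, item in enumerate(items):
        crawler.load_image_by_imageUrl(item, "E:\\workSpace\\TMP\\demo_%s.jpg" % index)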


 

# Execution results

1. Image crawling (ran for roughly an hour; throughput was acceptable):

(screenshot omitted)

2. Sending mail:

(screenshot omitted)

Reposted from: https://www.cnblogs.com/chenyuebai/p/6728532.html
