# Python爬虫,下载豆瓣小组图片
# -*- coding: utf-8 -*-
# -----------------------------------------------
# 程序:豆瓣小组图片爬虫
# 版本:1.0
# 语言:Python 3.4
# 作者:gdp12315
# 操作:输入豆瓣小组讨论版块地址、起始页面、终止页面
# 功能:下载小组帖子里发布的图片
# 注意:下载的保存地址为作者本机地址 读者根据自身情况更改
# -----------------------------------------------
import http.cookiejar
import http.cookies
import os
import random
import re
import socket
import time
import urllib.request
# Human-readable error messages, keyed by a string error code.
ERROR = {
    '0': 'Can not open the url,checck you net',
    '1': 'Creat download dir error',
    '2': 'The image links is empty',
    '3': 'Download faild',
    '4': 'Build soup error,the html is empty',
    '5': 'Can not save the image to your disk',
}
class BrowserBase(object):
    """Small HTTP client that opens URLs with cookies and a random User-Agent.

    The opener (and its cookie jar) is created once per instance and reused,
    so cookies set by one response are sent with later requests.
    """

    # Pool of User-Agent strings; one is picked at random for every request.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )

    def __init__(self):
        # Process-wide default: every socket created afterwards times out
        # after 20 seconds instead of blocking forever.
        socket.setdefaulttimeout(20)
        # Built lazily on the first openurl() call, then reused.
        self.opener = None

    def speak(self, name, content):
        """Print a message in the form ``[name]content``."""
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        """Open *url* and return the response object.

        The request carries a randomly chosen User-Agent, ``Accept: */*`` and
        a Referer equal to *url* itself.

        Raises:
            Exception: whatever the underlying opener raised; it is reported
                through ``speak`` and then re-raised unchanged.
        """
        opener = getattr(self, 'opener', None)
        if opener is None:
            # One cookie jar per instance so cookies survive across requests.
            cookie_support = urllib.request.HTTPCookieProcessor(
                http.cookiejar.CookieJar())
            opener = urllib.request.build_opener(
                cookie_support, urllib.request.HTTPHandler)
            self.opener = opener
        # Make this opener the module default so plain urllib.request calls
        # (urlopen / urlretrieve) send the same headers and cookies.
        urllib.request.install_opener(opener)

        agent = random.choice(self.USER_AGENTS)
        opener.addheaders = [
            ("User-agent", agent),
            ("Accept", "*/*"),
            ('Referer', url),
        ]
        try:
            return opener.open(url)
        except Exception as e:
            self.speak(str(e), url)
            # Bare raise keeps the original exception type and traceback.
            raise
# A Douban discussion list shows 25 topics per page; `start=` is the offset.
TOPICS_PER_PAGE = 25
# Seconds to wait after each topic so the site is not hammered.
REQUEST_DELAY = 2
# Download directory (the original author's local path; change as needed).
DOWNLOAD_DIR = r'D:\Python\pics'

# Topic links inside a discussion-list page.
TOPIC_URL_RE = re.compile(r'https?://www\.douban\.com/group/topic/\d+')
# Large-size topic images. The tail is non-greedy and excludes quotes,
# whitespace and angle brackets, so two URLs on one line are not merged.
# The doubanio.com host is accepted as well as douban.com.
IMAGE_URL_RE = re.compile(
    r'https?://img\d+\.douban(?:io)?\.com'
    r'/view/group_topic/large/public/'
    r'[^\s"\'<>]+?\.jpg'
)


def unique_in_order(items):
    """Return *items* without duplicates, keeping first-seen order."""
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def find_topic_urls(html):
    """Return the distinct topic URLs found in a discussion-list page."""
    return unique_in_order(TOPIC_URL_RE.findall(html))


def find_image_urls(html):
    """Return the distinct large-image URLs found in a topic page."""
    return unique_in_order(IMAGE_URL_RE.findall(html))


def image_file_name(img_url):
    """Return the file name (last path segment) of *img_url*."""
    return img_url.rsplit('/', 1)[-1]


def fetch_html(browser, url):
    """Fetch *url* with *browser* and return its text, or None on failure."""
    try:
        with browser.openurl(url) as response:
            return response.read().decode('utf-8', 'replace')
    except Exception as err:
        # openurl may raise a plain Exception, so nothing narrower can be
        # caught here; report the failure and let the crawl continue.
        print('Can not open %s: %s' % (url, err))
        return None


def main():
    """Ask for a group URL and a page range, then download topic images."""
    spider = BrowserBase()
    # ------------ begin ----------------------------
    # Example input:
    #   http://www.douban.com/group/Xsz/discussion?start=
    #   1
    #   2
    url = input(u'请输入豆瓣小组地址,去掉start=后面的数字:\n').strip()
    page_bgn = int(input(u'请输入开始时的页码:\n'))
    page_end = int(input(u'请输入结束时的页码:\n'))

    # urlretrieve cannot create missing directories itself.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    for page in range(page_bgn, page_end + 1):
        # Fetch the topic-list page for this offset.
        start = (page - 1) * TOPICS_PER_PAGE
        html_topic_list = fetch_html(spider, url + str(start))
        if html_topic_list is None:
            continue

        # Visit every topic and download the images it contains.
        for topic_url in find_topic_urls(html_topic_list):
            print('topic_url ' + topic_url)
            html_topic = fetch_html(spider, topic_url)
            if html_topic is None:
                continue

            # A topic may contain several images.
            for img_url in find_image_urls(html_topic):
                print('img_url: ' + img_url)
                target = os.path.join(DOWNLOAD_DIR, image_file_name(img_url))
                try:
                    urllib.request.urlretrieve(img_url, target)
                except Exception as err:
                    # Best effort: one broken image must not stop the crawl.
                    print('Download faild %s: %s' % (img_url, err))
            time.sleep(REQUEST_DELAY)

    print('采集完成!')


if __name__ == '__main__':
    main()