Python伪装浏览器请求爬取豆瓣小组

最新推荐文章于 2021-03-26 18:20:28 发布

Evan_Gu

最新推荐文章于 2021-03-26 18:20:28 发布

阅读量2.5k

点赞数 1

分类专栏：综合　文章标签：python 爬虫 豆瓣

本文链接：https://blog.csdn.net/gdp12315_gu/article/details/47323613

版权

综合专栏收录该内容

114 篇文章 2 订阅

订阅专栏

Python爬虫，下载豆瓣小组图片

# -*- coding: utf-8 -*-
# -----------------------------------------------
#   程序：豆瓣小组图片爬虫
#   版本：1.0
#   语言：Python 3.4
#   作者：gdp12315
#   操作：输入豆瓣小组讨论版块地址、起始页面、终止页面
#   功能：下载小组帖子里发布的图片
#   注意：下载的保存地址为作者本机地址 读者根据自身情况更改
# -----------------------------------------------

	
import http.cookiejar
import http.cookies
import os
import random
import re
import socket
import time
import urllib.request


# User-facing error messages, keyed by a numeric code stored as a string.
ERROR = {
    '0': 'Can not open the url,check your net',
    '1': 'Create download dir error',
    '2': 'The image links are empty',
    '3': 'Download failed',
    '4': 'Build soup error,the html is empty',
    '5': 'Can not save the image to your disk',
}

class BrowserBase(object):
    """Small HTTP client that sends browser-like headers with every request.

    The opener (and the cookie jar behind it) is created on the first call to
    ``openurl`` and reused afterwards, so cookies set by one response are sent
    with later requests.
    """

    # One of these User-Agent strings is picked at random for each request.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    ]

    def __init__(self):
        # Process-wide setting: sockets created from now on time out after 20 s.
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        """Print ``content`` prefixed with ``[name]``."""
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        """Open ``url`` with browser-like headers and return the response.

        The request carries a randomly chosen User-Agent, ``Accept: */*`` and
        the URL itself as ``Referer``.

        Raises:
            Exception: whatever the opener raised (for example
                ``urllib.error.URLError``); the error is printed via
                ``speak`` and then re-raised unchanged.
        """
        opener = getattr(self, 'opener', None)
        if opener is None:
            # Build the opener only once so the cookie jar survives between
            # requests instead of being thrown away after every call.
            cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
            opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
            self.opener = opener
            # Installed globally so that urllib.request.urlopen/urlretrieve
            # also go through this opener and its headers.
            urllib.request.install_opener(opener)

        agent = random.choice(self.USER_AGENTS)
        opener.addheaders = [("User-agent", agent), ("Accept", "*/*"), ('Referer', url)]
        try:
            return opener.open(url)
        except Exception as e:
            self.speak(str(e), url)
            # Bare raise keeps the original exception type and traceback.
            raise
		

# ------------ begin ----------------------------
# Example input:
#   http://www.douban.com/group/Xsz/discussion?start=
#   1
#   2

# The ``start=`` offset of the discussion list advances by this many topics
# for every page.
TOPICS_PER_PAGE = 25

# Download directory (the author's local path; change it to suit your machine).
SAVE_DIR = r'D:\Python\pics'

TOPIC_URL_RE = re.compile(r'http://www\.douban\.com/group/topic/\d+')
# Non-greedy and quote/whitespace-bounded so that several images on one line
# are matched as separate URLs instead of one merged string.
IMAGE_URL_RE = re.compile(
    r'http://img\d\.douban\.com/view/group_topic/large/public/[^"\'\s<>]+?\.jpg')


def unique_in_order(items):
    """Return ``items`` as a list without duplicates, keeping first-seen order."""
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def extract_topic_urls(html):
    """Return the distinct topic URLs found in a discussion-list page."""
    return unique_in_order(TOPIC_URL_RE.findall(html))


def extract_image_urls(html):
    """Return the distinct large-image URLs found in a topic page."""
    return unique_in_order(IMAGE_URL_RE.findall(html))


def image_save_path(img_url, save_dir=SAVE_DIR):
    """Return the local path for ``img_url``: ``save_dir`` plus the URL's file name."""
    file_name = img_url.rsplit('/', 1)[-1]
    return os.path.join(save_dir, file_name)


def fetch_html(browser, url):
    """Fetch ``url`` through ``browser.openurl`` and return the body as text.

    Bytes that are not valid UTF-8 are replaced rather than aborting the crawl.
    The response is always closed.
    """
    response = browser.openurl(url)
    try:
        return response.read().decode('utf-8', 'replace')
    finally:
        response.close()


def main():
    """Prompt for a group URL and page range, then download the topic images.

    A page, topic or image that fails to download is reported and skipped so
    that one bad request does not abort the whole run.
    """
    url = input(u'请输入豆瓣小组地址，去掉start=后面的数字：\n').strip()
    page_bgn = int(input(u'请输入开始时的页码:\n'))
    page_end = int(input(u'请输入结束时的页码:\n'))

    try:
        os.makedirs(SAVE_DIR, exist_ok=True)
    except OSError as e:
        print('[%s]%s' % (ERROR['1'], e))
        return

    splider = BrowserBase()
    first_offset = (page_bgn - 1) * TOPICS_PER_PAGE
    last_offset = (page_end - 1) * TOPICS_PER_PAGE

    for offset in range(first_offset, last_offset + 1, TOPICS_PER_PAGE):
        # Fetch the topic list page.
        try:
            html_topic_list = fetch_html(splider, url + str(offset))
        except Exception:
            # openurl() reports the failure itself before raising; move on
            # to the next page.
            continue

        # Visit every topic and download the images posted in it.
        for topic_url in extract_topic_urls(html_topic_list):
            print('topic_url ' + topic_url)
            try:
                html_topic = fetch_html(splider, topic_url)
            except Exception:
                continue

            # A topic may contain several images.
            for img_url in extract_image_urls(html_topic):
                print('img_url: ' + img_url)
                try:
                    # urlretrieve goes through the opener that openurl()
                    # installed globally, so it sends the same headers.
                    urllib.request.urlretrieve(img_url, image_save_path(img_url))
                except OSError as e:
                    print('[%s]%s' % (ERROR['3'], e))
                # Pause between downloads to avoid hammering the server.
                time.sleep(2)

    print('采集完成!')


if __name__ == '__main__':
    main()