豆瓣爬虫(Python 3)

只用了一下午,我的 IP 就被封了。试过换代理 IP 也不行,应该是他们把免费的代理 IP 都收集并屏蔽了。
后来发现另一个办法是换 Cookie:先用自己的账号在浏览器上登录,然后复制出 Cookie,换上就可以用了。

import urllib.request

import re

from bs4 import BeautifulSoup as bs

from urllib.request import quote

import random

def urlopen(url, timeout=30):
    """Fetch *url* with browser-like request headers and return the raw body.

    Args:
        url: Address to request. The ``Host`` header is fixed to
            ``book.douban.com``, so HTTP(S) requests are meant for that site.
        timeout: Seconds to wait on blocking network operations before
            giving up (passed straight to ``urllib.request.urlopen``).

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success HTTP status codes).
    """
    head = {}
    head['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    # Accept-Encoding is intentionally not sent: urllib.request does not
    # decompress gzip/deflate/br bodies, so the caller would get compressed
    # bytes back.
    head['Accept-Language'] = 'zh-CN,zh;q=0.9'
    head['Cache-Control'] = 'no-cache'
    head['Connection'] = 'keep-alive'
    # Session cookie copied from a browser; replace it with your own when it
    # stops working.
    head['Cookie'] = 'viewed="30209224"; bid=Wv1u2my5GJI; gr_user_id=ec943490-8875-40fe-b5b9-538d784cbf84; _vwo_uuid_v2=D6C966AC33758154BD3FC61FB43687FE2|718456dd9cd6126870e9d38c3a11a25e; douban-fav-remind=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=88341540-2549-49a5-9bbc-398f72264aa6; gr_cs1_88341540-2549-49a5-9bbc-398f72264aa6=user_id%3A0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1540562323%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DFgx2sSw757rrLsA5hTtkRWNmEqYyh7bKLBtfX58Svc5xUbpPONhB3uzxN48sO0Zu%26wd%3D%26eqid%3Df30205fb00003607000000065bd31d79%22%5D; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utma=30149280.1606666109.1528767205.1534148313.1540562324.3; __utmc=30149280; __utmz=30149280.1540562324.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt_douban=1; __utma=81379588.1156273593.1540562324.1540562324.1540562324.1; __utmc=81379588; __utmz=81379588.1540562324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_88341540-2549-49a5-9bbc-398f72264aa6=true; __yadk_uid=HYwciqRaHPiOYtoDfSppGXtRAJ45ZXiD; _pk_id.100001.3ac3=f9fe87f282b62e3b.1540562323.1.1540562400.1540562323.; __utmb=30149280.4.10.1540562324; __utmb=81379588.4.10.1540562324'
    head['Host'] = 'book.douban.com'
    head['Pragma'] = 'no-cache'
    head['Upgrade-Insecure-Requests'] = '1'
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    req = urllib.request.Request(url, headers=head)

    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()


def contlist(url):
    """Collect the tag-page URLs linked from the hot-tags list of a page.

    Downloads *url*, decodes it as UTF-8, and looks at the first
    ``<ul class="hot-tags-col5 s">`` element. For every ``<a class="tag">``
    inside it, the ``href`` is percent-encoded and prefixed with
    ``https://book.douban.com``.

    Args:
        url: Address of the page that contains the hot-tags list.

    Returns:
        A list of absolute URL strings, in document order. The list is empty
        when the page has no hot-tags list or no matching links.
    """
    page = bs(urlopen(url).decode('utf-8'), 'lxml')

    tag_list = page.find('ul', class_="hot-tags-col5 s")
    if tag_list is None:
        # No hot-tags list in the response (e.g. a changed layout or an
        # error page): report "nothing found" instead of raising IndexError.
        return []

    r_list = []
    for anchor in tag_list.find_all('a'):
        # Read the attributes directly instead of stringifying the tag,
        # regex-matching it and re-parsing it; this also no longer depends on
        # "class" being the first attribute in the markup.
        if anchor.get('class') != ['tag']:
            continue
        href = anchor.get('href')
        if not href:
            continue
        # The href is treated as a site-relative path; quote() escapes any
        # non-ASCII characters in it so the result is a valid request URL.
        r_list.append('https://book.douban.com' + quote(href))

    return r_list
            
        
    

# Entry page whose hot-tags list seeds the crawl.
url = 'https://book.douban.com/'

# Absolute URLs of the tag pages discovered on the entry page.
content = contlist(url)

print(len(content))

# Visit every tag page and print title, publication info and link for each
# book entry found on it.
for tag_url in content:

    raw_page = urlopen(tag_url)
    print(tag_url)

    # Raw bytes are handed to BeautifulSoup, which detects the encoding.
    page = bs(raw_page, 'lxml')

    headings = page.find_all('h2')
    pub_infos = page.find_all('div', class_="pub")

    # Headings and "pub" blocks are paired by position, as before.
    # NOTE(review): this assumes each book entry contributes exactly one <h2>
    # and one <div class="pub"> in the same order - confirm against the page.
    # zip() stops at the shorter list instead of raising IndexError, and the
    # loop variable no longer shadows the outer one.
    for heading, pub_info in zip(headings, pub_infos):

        link = heading.a
        if link is None:
            # An <h2> without a link is not a book entry; skip it.
            continue

        Author_con = pub_info.text.strip()

        url2 = link.get('href', '')

        # Fall back to the link text when the anchor has no title attribute.
        title = link.get('title') or link.get_text(strip=True)

        print('书名:'+title)
        print('作者:'+Author_con)
        print('链接:'+url2)
        print('\n')











    


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值