豆瓣爬虫 Python3

最新推荐文章于 2020-12-21 03:27:21 发布

18923489164

最新推荐文章于 2020-12-21 03:27:21 发布

阅读量802

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/AnYeZhiYin/article/details/83449636

版权

爬虫专栏收录该内容

117 篇文章 14 订阅

订阅专栏

只用了一下午，我的 IP 就被封了。试了换 IP 也不行，应该是他们把免费的代理 IP 都收集起来屏蔽了。
然后发现另一个办法就是换 Cookie：先用自己的账号在浏览器上登录，然后把 Cookie 复制出来换上，就可以用了。

import urllib.request

import re

from bs4 import BeautifulSoup as bs

from urllib.request import quote

import random

def urlopen(url, timeout=30):
    """Fetch ``url`` with browser-like request headers and return the raw body.

    Args:
        url: Address to request. The ``Host`` header is fixed to
            ``book.douban.com``, so the helper is meant for URLs on that host.
        timeout: Seconds to wait on blocking socket operations before giving
            up; forwarded to ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            (``HTTPError`` is a subclass).
    """
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'Accept-Encoding' is deliberately not sent: the body is returned
        # exactly as read, and nothing here decompresses it.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # NOTE(review): this is a session cookie copied out of a logged-in
        # browser; it is account-specific, so replace it with your own.
        'Cookie': 'viewed="30209224"; bid=Wv1u2my5GJI; gr_user_id=ec943490-8875-40fe-b5b9-538d784cbf84; _vwo_uuid_v2=D6C966AC33758154BD3FC61FB43687FE2|718456dd9cd6126870e9d38c3a11a25e; douban-fav-remind=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=88341540-2549-49a5-9bbc-398f72264aa6; gr_cs1_88341540-2549-49a5-9bbc-398f72264aa6=user_id%3A0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1540562323%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DFgx2sSw757rrLsA5hTtkRWNmEqYyh7bKLBtfX58Svc5xUbpPONhB3uzxN48sO0Zu%26wd%3D%26eqid%3Df30205fb00003607000000065bd31d79%22%5D; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utma=30149280.1606666109.1528767205.1534148313.1540562324.3; __utmc=30149280; __utmz=30149280.1540562324.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt_douban=1; __utma=81379588.1156273593.1540562324.1540562324.1540562324.1; __utmc=81379588; __utmz=81379588.1540562324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_88341540-2549-49a5-9bbc-398f72264aa6=true; __yadk_uid=HYwciqRaHPiOYtoDfSppGXtRAJ45ZXiD; _pk_id.100001.3ac3=f9fe87f282b62e3b.1540562323.1.1540562400.1540562323.; __utmb=30149280.4.10.1540562324; __utmb=81379588.4.10.1540562324',
        'Host': 'book.douban.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }

    req = urllib.request.Request(url, headers=head)

    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()


def contlist(url):
    """Collect absolute tag-page URLs from the hot-tags list of the page at ``url``.

    The page is downloaded with ``urlopen``, decoded as UTF-8 and parsed with
    the ``lxml`` parser. Only the first ``<ul class="hot-tags-col5 s">`` element
    is inspected.

    Args:
        url: Address of the page that contains the hot-tags list.

    Returns:
        A list of strings, one per matching anchor, each built as
        ``'https://book.douban.com'`` + the percent-quoted ``href``. The list is
        empty when the page contains no hot-tags ``<ul>``.
    """
    page = bs(urlopen(url).decode('utf-8'), 'lxml')

    tag_lists = page.find_all('ul', class_="hot-tags-col5 s")
    if not tag_lists:
        # No hot-tags list in the document (e.g. the layout changed or a
        # different page was served): report "no links" instead of failing
        # with an IndexError on tag_lists[0].
        return []

    r_list = []
    for anchor in tag_lists[0].find_all('a'):
        # Keep an anchor only when its serialized markup contains the text
        # 'a class="tag"' exactly once; anchors whose class value is anything
        # other than exactly "tag" do not contain that text.
        if str(anchor).count('a class="tag"') != 1:
            continue
        # Read href straight from the parsed tag; serializing the anchor and
        # parsing it a second time is unnecessary.
        r_list.append('https://book.douban.com' + quote(anchor['href']))

    return r_list
            
        
    

# Script body: gather the tag-page URLs from the book home page, then fetch
# every tag page and print title, publication line and link for each entry.
url = 'https://book.douban.com/'

content = contlist(url)

print(len(content))

for tag_url in content:

    htm = urlopen(tag_url)
    print(tag_url)

    htm = bs(htm, 'lxml')

    h2 = htm.find_all('h2')

    Author = htm.find_all('div', class_="pub")

    # Pair the n-th <h2> with the n-th "pub" div. zip() stops at the shorter
    # sequence, so a page with fewer headings than "pub" divs no longer fails
    # with an IndexError, and the loop names no longer shadow the outer
    # loop variable.
    for heading, pub in zip(h2, Author):

        # Attributes of the heading's link: 'href' and 'title' are read below.
        h2_con = heading.a.attrs

        Author_con = pub.text.strip()

        url2 = h2_con['href']

        title = h2_con['title']

        print('书名：'+title)
        print('作者：'+Author_con)
        print('链接：'+url2)
        print('\n')

18923489164

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
豆瓣爬虫 Python3

只用了一下午，我的 IP 就被封了。import urllib.request import re from bs4 import BeautifulSoup as bs from urllib.request import quote import random def urlopen(url): head = {} head['Accept'] = 'text/html,a...
复制链接

扫一扫