# A simple single-threaded Python crawler that scrapes emoticon images (beginner practice).

import requests
from bs4 import BeautifulSoup
def biaoqingbao(beginPage, endPage):
    """Crawl emoticon listing pages from doutula.com and download every image.

    Fetches listing pages ``beginPage`` through ``endPage`` (inclusive),
    extracts the ``data-original`` URL of each lazily-loaded image and hands
    it to :func:`writeImg` for download.

    :param beginPage: first page number to crawl (inclusive)
    :param endPage: last page number to crawl (inclusive)
    :raises requests.RequestException: on network failure
    """
    url = "https://www.doutula.com/article/list/?page="
    sess = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36',
        # NOTE(review): hard-coded session cookie — it will expire; refresh it
        # (or log in programmatically) if requests start failing.
        'cookie' : '__cfduid=d24b0865c62cf58b642f4f35ffec61ce21515548138; UM_distinctid=160ddb55d7b280-09e1869e5aa032-35447354-1fa400-160ddb55d7c290; XSRF-TOKEN=eyJpdiI6IjZvSWZSVFFyeW50Smdtc2t0ZDFjc3c9PSIsInZhbHVlIjoiUnNGdU5EdzhWQ1JPVFRSalU5aWpzdFdleWRcL3FSWTdwaEtZXC84U3RxcjZXeTV4MWJmNVRaSVZYdHQxWDIxdHpuRnpMRjJBbUJjR2NIN1U3cnNOS1k1Zz09IiwibWFjIjoiOTg2ZjVkNTdkMmM5YjMyZDNiNjRmNzAxMjJiZTc5OGNiZTVlOGMxNmMzMmY0MWY3NzU4MjMzZTM1OWRjZjkxMiJ9; laravel_session=eyJpdiI6ImxrUERWQ0NpSHFxRDNSc2VsaFd0dWc9PSIsInZhbHVlIjoicG9COGJOU1wvT1JqaENJckY1a0xcL2pXVWxIaG5KMEY1SGJuRGlFd1wvWnkwdEZLRXl5QUNxbXpYZEVQN3N6XC96UCt1ZGFlOEEreHk1WWtRV3E1UXBscjVRPT0iLCJtYWMiOiI2OTQ3MWY1MDQ4MjNkZGZmNmJiNTU4OWZlNDkxYTUzNTIzNjg0MzBhZDg1YzE1MWFkOGM2MjkwNmVhOWQ1ZmU0In0%3D; _ga=GA1.2.1818345319.1515548139; _gid=GA1.2.101140113.1515548139; yjs_id=aHR0cHM6Ly93d3cuZG91dHVsYS5jb20vYXJ0aWNsZS9saXN0Lz9wYWdlPTF8MTUxNTU1Njg0MTU2MQ; CNZZDATA1256911977=2123601880-1515544387-%7C1515553089',
        # 'referer': 'https: // www.doutula.com / article / list /?page = 1'
    }
    for page in range(beginPage, endPage + 1):
        finalUrl = url + str(page)
        # 30 s is ample for one HTML page; the original timeout of 500 s
        # would stall the crawler for over 8 minutes on a dead connection.
        html = sess.get(finalUrl, headers=headers, timeout=30).content.decode()
        bs = BeautifulSoup(html, "lxml")
        img = bs.find_all(attrs={"class": "lazy image_dtb img-responsive"})
        downloaded = []  # renamed from `list`, which shadowed the builtin
        for src in img:
            img_src = src.get('data-original')
            # `src.get` returns None when the attribute is missing; the
            # original `img_src.find('http')` would crash on None, and
            # startswith is the idiomatic prefix test anyway.
            if img_src and img_src.startswith('http'):
                print(img_src)
                downloaded.append(img_src)
                writeImg(img_src, headers)
        print('爬取完成' + str(len(downloaded)) + "条")
def writeImg(link, headers):
    """Download a single image and save it in the current directory.

    The saved file is named ``'bqb' + <last URL path segment>``.

    :param link: absolute URL of the image to download
    :param headers: HTTP headers (User-Agent / cookie) to send with the request
    :raises requests.RequestException: on network failure or timeout
    """
    sess = requests.Session()
    # Add a timeout so a stalled download cannot hang the crawler forever.
    data = sess.get(link, headers=headers, timeout=30).content
    # Use the URL's final path segment as the file name: the original raw
    # slice link[-12:] could contain '/' and produce an invalid path.
    filename = link.rsplit('/', 1)[-1] or link[-12:]
    with open('bqb' + filename, "wb") as f:
        f.write(data)




if __name__ == "__main__":
    # Prompt for an inclusive page range. int() raises ValueError on
    # non-numeric input, which is surfaced to the user directly.
    beginPage = int(input("请输入起始页"))
    endPage = int(input("请输入结束页"))
    # Guard against an inverted range, which would otherwise silently
    # crawl nothing (range would be empty).
    if beginPage > endPage:
        raise SystemExit("起始页不能大于结束页")
    biaoqingbao(beginPage, endPage)
# (Removed: CSDN blog-page boilerplate — like/favorite/tip-jar UI text — that
# was accidentally captured when this script was copied from the web page.)