import requests
import re
from bs4 import BeautifulSoup
import os
import urllib
import threading #导入多线程库
def getTEXT(url):
    """Fetch *url* and return its decoded HTML text, or the string '错误' on failure.

    Sends a minimal browser-like User-Agent, raises on HTTP error status,
    and lets `apparent_encoding` fix mis-declared charsets (common on
    Chinese sites). Any request-level failure is reported via the
    sentinel return value rather than an exception, matching the callers.
    """
    try:
        kv = {"user-agent": "mozilla/5.0"}
        # timeout prevents a dead server from hanging a worker thread forever
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrow catch: only network/HTTP problems, not programming errors
        return '错误'
def get_erery_site_url(text):
    """Parse a listing page and return parallel lists (urls, names).

    For every ``<h2><a href=...>`` on the page, collects the anchor's
    ``href`` and its ``.contents`` list (used later as a title).
    Headings without a linked anchor are skipped instead of crashing
    (the original raised AttributeError on ``None.attrs``).
    """
    every_urls = []
    every_names = []
    soup = BeautifulSoup(text, "html.parser")
    for heading in soup.find_all('h2'):
        anchor = heading.find('a')
        if anchor is None or 'href' not in anchor.attrs:
            continue  # e.g. a plain section heading — nothing to scrape
        every_urls.append(anchor.attrs['href'])
        every_names.append(anchor.contents)
    return every_urls, every_names
def congwangzhi_natudewangzhi(wangzhi):
    """Fetch a post page *wangzhi* and return the image URLs in its #post_content div.

    Each ``<p>`` contributes either the ``href`` of its first ``<a>`` (the
    site wraps full-size images in links), or, when no such link exists,
    the ``src`` of every ``<img>`` it contains.
    """
    shuju = getTEXT(wangzhi)
    chuli = BeautifulSoup(shuju, "html.parser")
    content = chuli.find(id="post_content")
    tupianwangzhis = []
    for paragraph in content.find_all('p'):
        try:
            # prefer the wrapping link's target (usually the full-size image)
            tupianwangzhis.append(paragraph.a.attrs['href'])
        except (AttributeError, KeyError):
            # no <a> (or it lacks href) — fall back to the raw <img> sources
            for img in paragraph.find_all('img'):
                tupianwangzhis.append(img.attrs['src'])
    return tupianwangzhis
def xiazaitupian(tupianwangzhis, path, biaoti):
    """Download every URL in *tupianwangzhis* into directory *path*.

    File names are ``biaoti[0]`` (the post title fragment) plus the last
    six characters of the image URL — a crude extension/uniqueness
    heuristic kept from the original design. A single failed download is
    skipped so it no longer aborts the whole post.
    """
    print(path + '图片开始下载,注意查看文件夹')
    os.makedirs(path, exist_ok=True)  # create the target folder when missing
    for tupianwangzhi in tupianwangzhis:
        try:
            tu = requests.get(tupianwangzhi,
                              headers={"user-agent": "mozilla/5.0"},
                              timeout=10)
            tu.raise_for_status()
        except requests.RequestException:
            continue  # dead image link — move on to the next one
        # os.path.join instead of hard-coded '\\' so the path is portable
        filename = biaoti[0] + tupianwangzhi[-6:]
        with open(os.path.join(path, filename), mode='wb') as obj:
            obj.write(tu.content)
def pachong(i):
    """Scrape listing page *i* of vooc.net and download every post's images.

    Worker body for one thread: fetches the listing page, extracts each
    post's URL and title, then downloads that post's images into *path*.
    """
    url = 'https://www.vooc.net/page/' + str(i)
    a = getTEXT(url)
    wangzhis, biaotis = get_erery_site_url(a)
    for biaoti, wangzhi in zip(biaotis, wangzhis):
        tupianwangzhis = congwangzhi_natudewangzhi(wangzhi)
        path = 'E:\\tupian'  # TODO: make the download directory configurable
        xiazaitupian(tupianwangzhis, path, biaoti)
if __name__ == "__main__":
    # Guard the entry point so importing this module no longer prompts
    # the user and spawns threads as a side effect.
    page_count = int(input('你想爬取的页数:'))
    for x in range(1, page_count + 1):
        # one thread per listing page: the work is I/O-bound, so the
        # network waits overlap even under the GIL
        threading.Thread(target=pachong, args=(x,)).start()
# Learning note: a multithreaded crawler that downloads images.
# (Blog-footer residue from the source page: "最新推荐文章于 2024-03-14 22:21:44 发布")