Web Scraping: Batch Downloading Wallpapers
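
This script batch-downloads wallpapers from http://www.jj20.com: it collects the gallery links on the landing page, pages through each gallery by following its "next" link, and saves every full-size image under ./壁纸/<gallery title>/.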

import requests
import os
import re
from bs4 import BeautifulSoup

def f(url_data):
    # Drop the last path segment but keep the trailing slash,
    # e.g. 'http://site/a/b.html' -> 'http://site/a/'.
    return url_data.rsplit('/', 1)[0] + '/'

headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getsum(url, name):
    # Read the gallery's total page count from the "x/NN" counter
    # in the div.wzfz header, then crawl all of its pages.
    get_page_text = requests.get(url=url, headers=headers)
    get_page_text.encoding = 'gb2312'
    tt_soup = BeautifulSoup(get_page_text.text, 'lxml')
    img_sum_data = list(tt_soup.find('div', class_='wzfz tu-tit fix'))
    pattern = r'.*?\d/(\d{1,3})'
    img_sum = int(re.findall(pattern, img_sum_data[1].text, re.S)[0])
    getAll(img_sum, url, name)

def getAll(img_sum, url, name):
    # Collect the URL of every page in the gallery by following the
    # "next page" link img_sum-1 times, then download each page's image.
    page_all = []
    for i in range(img_sum - 1):
        s = f(url)
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        next_link = soup.select('.next')
        page_all.append(url)
        url = s + next_link[0]['href']
    down(page_all, name)
def down_img(url, name, n):
    # Save one image as ./壁纸/<name>/<name><n>.jpg, creating both
    # directory levels on first use.
    img_data = requests.get(url=url, headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/{name}{n}.jpg', 'wb') as fp:
        fp.write(img_data)
        print(f'{name}{n}.jpg 爬取成功!')
def down(page_all, name):
    # Visit each collected page and download its full-size image.
    n = 0
    for i in page_all:
        n += 1
        page_text = requests.get(url=i, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        img_url = soup.select('.photo img')[0]['src']
        down_img(img_url, name, n)
def main():
    # Scrape the landing page for gallery links (href) and titles,
    # then crawl each gallery in turn.
    url = 'http://www.jj20.com'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    tt_url_data = soup.select('.picbz>li>a')
    tt_url = []
    tt_url_name = []
    for i in tt_url_data:
        tt_url.append(i['href'])
        tt_url_name.append(i['title'])
    for i in range(len(tt_url)):
        img_url = url + tt_url[i]
        img_name = tt_url_name[i]
        getsum(img_url, img_name)
if __name__=='__main__':
    main()
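
A side note on the f() helper: it rebuilds a page's base URL by splitting on '/'. The standard library's urllib.parse.urljoin performs the same relative-href resolution and also copes with absolute and root-relative links. A minimal sketch (the example URLs are made up for illustration):

from urllib.parse import urljoin

page = 'http://www.jj20.com/bz/nxxz/shxz/1234.html'  # hypothetical gallery page
print(urljoin(page, '1234_2.html'))     # -> http://www.jj20.com/bz/nxxz/shxz/1234_2.html
print(urljoin(page, '/bz/other.html'))  # root-relative hrefs resolve correctly too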


Update
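
The reworked version below drops the page-count parsing: instead of reading the "x/NN" counter up front, down() keeps following each gallery's "next" link until the lookup fails. It also de-duplicates the gallery links collected from the listing page and prefixes "http:" to the protocol-relative image srcs.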

import requests
import os
from bs4 import BeautifulSoup  # the 'lxml' parser must be installed, but needs no import

def f(url_data):
    # Drop the last path segment but keep the trailing slash,
    # e.g. 'http://site/a/b.html' -> 'http://site/a/'.
    return url_data.rsplit('/', 1)[0] + '/'

headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
def getAll(url, name):
    # Seed the crawl with the gallery's first page; down() follows the
    # "next" links from there, so only the starting URL is collected here.
    page_all = []
    try:
        s = f(url)
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        next_link = soup.select('.next')
        page_all.append(url)
        url = s + next_link[0]['href']  # lookahead only; the result is unused
    except Exception as e:
        print(f'Failed to open first page of {name}: {e}')
    down(page_all, name)
def down_img(url, name, n):
    # Image srcs on the detail pages are protocol-relative ('//...'),
    # so the scheme is prefixed before requesting.
    img_data = requests.get(url=f"http:{url}", headers=headers).content
    path = f'./壁纸/{name}'
    os.makedirs(path, exist_ok=True)
    with open(f'{path}/{name}{n}.jpg', 'wb') as fp:
        fp.write(img_data)
        print(f'{name}{n}.jpg 爬取成功!')
def geturl(url):
    # Return the src of the full-size image on a detail page.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    img_url = soup.select(".photo>a>img")
    return img_url[0]['src']
def down(page_all, name):
    # Walk each gallery page by page until there is no "next" link
    # (or a request fails), downloading one image per page.
    for i in page_all:
        n = 0
        url_1 = i
        while True:
            try:
                n += 1
                if n >= 2:
                    # "next" hrefs are relative to the /bz/nxxz/shxz/
                    # listing, hence the hardcoded prefix.
                    img_url = getnext(url_1)
                    img = f"http://www.jj20.com/bz/nxxz/shxz/{img_url}"
                    url_1 = img
                    url = geturl(img)
                    down_img(url, name, n)
                else:
                    url = geturl(i)
                    down_img(url, name, n)
            except Exception:
                break  # end of gallery (or a request error)
def getnext(url):
    # Return the href of the "next page" link on the current page.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    next_link = soup.select(".next")
    return next_link[0]['href']
def main():
    # Scrape one listing page for gallery links and titles, de-duplicate
    # the links (the listing markup repeats each href, while there is
    # only one <img> title per gallery), then crawl every gallery.
    url = 'http://www.jj20.com/bz/nxxz/list_7_cc_14.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    tt_url_data = soup.select('.picbz>li>a')
    tt_name = soup.select('.picbz>li>a>img')
    tt_url = []
    tt_url_name = []
    for i in tt_url_data:
        tt_url.append(f"http://www.jj20.com/{i['href']}")
    for i in tt_name:
        tt_url_name.append(i['alt'])
    tt_url_1 = []
    for i in tt_url:
        if i not in tt_url_1:
            tt_url_1.append(i)
    for i in range(len(tt_url_1)):
        img_url = tt_url_1[i]
        img_name = tt_url_name[i]
        getAll(img_url, img_name)
if __name__=='__main__':
    main()
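
One more hedged sketch: the except/break pattern above also swallows network errors. Assuming the page structure stays as above, the repeated request/encoding/parsing boilerplate could be centralized with a requests.Session and a timeout (get_soup is a name introduced here, not part of the original script):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
# reuse the User-Agent header defined earlier in the script
session.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/87.0.4280.88 Safari/537.36')

def get_soup(url, timeout=10):
    # Single place for request errors, encoding, and parsing.
    resp = session.get(url, timeout=timeout)
    resp.raise_for_status()  # surface HTTP errors explicitly
    resp.encoding = 'gb2312'
    return BeautifulSoup(resp.text, 'lxml')

geturl() and getnext() could then call get_soup() and catch requests.RequestException where a failure should end the gallery walk, instead of a bare except.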