import io
import sys
import requests
from bs4 import BeautifulSoup
import os
# --- Setup: output directory, crawl range, target site, connectivity probe ---

# Root directory that will hold one sub-folder of images per listing page.
if not os.path.exists("root"):
    os.mkdir("root")

max_page = 1208  # last paginated listing page to crawl (inclusive)
start_page = 2   # first paginated page (page 1 lives at /index.htm, not /index_1.htm)
web = "http://www.netbian.com"
test_url = "/index.htm"    # used only to probe that the site is reachable
url = "/index_{}.htm"      # template for paginated listing pages

# Pretend to be a desktop browser; the site may reject the default requests UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
}

pre_page = ""
try:
    # Bounded timeout so an unreachable host fails fast instead of hanging forever.
    resp = requests.get(web + test_url, headers=headers, timeout=10)
    pre_page = resp.text
    resp.close()
    if not resp.ok:
        print("无法连接网站")
        sys.exit(1)
except requests.RequestException:
    # Narrow except: only network-level failures mean "cannot connect";
    # the original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    print("无法连接网站异常")
    sys.exit(1)
print("连接成功,正在爬取...")
# --- Main crawl loop: fetch each listing page, then every detail page's image ---
for i in range(start_page, max_page + 1):
    pageurl = web + url.format(i)
    try:
        resp = requests.get(pageurl, headers=headers, timeout=10)
        page_text = resp.text
        resp.close()
    except requests.RequestException:
        print("无法爬取页面", i, "正在跳过")
        continue

    soup = BeautifulSoup(page_text, "html.parser")
    div = soup.find("div", class_="list")
    anchors = div.find_all("a")
    # Keep only site-relative detail links; hrefs containing "http" are
    # absolute links to ads/external pages and are skipped.
    de = [a.get("href") for a in anchors if "http" not in str(a.get("href"))]
    print("在第{}个界面搜索到".format(i), de)

    # One sub-folder per listing page number.
    page_dir = "root/{}".format(i)
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)

    for ph in de:
        try:
            turl = web + ph
            tresp = requests.get(turl, headers=headers, timeout=10)
            tpage = BeautifulSoup(tresp.text, "html.parser")
            # AttributeError here (no div.pic / no img) is caught below as a failed item.
            img_path = tpage.find("div", class_="pic").find("img").get("src")
            print("\r爬取", img_path, end="")
            tresp.close()
            tresp = requests.get(img_path, headers=headers, timeout=10)
            name = img_path.split("/")[-1]
            # `with` guarantees the file handle is closed even if write() raises;
            # the original leaked the handle on error.
            with open("root/" + str(i) + "/" + str(name), "wb") as f:
                f.write(tresp.content)
            tresp.close()
        except (requests.RequestException, AttributeError, OSError):
            # Best-effort per image: report and move on to the next link.
            print("\n")
            print("爬取{}失败...".format(ph))
            continue
# 一个批量爬取图片的代码,拿走不谢
# (blog-paste residue, not code) 最新推荐文章于 2024-04-16 10:06:03 发布