import io
import sys
import requests
from bs4 import BeautifulSoup
import os
# --- Setup: output directory, crawl range, target site, connectivity probe ---

# Root directory that will hold one sub-folder of images per listing page.
if not os.path.exists("root"):
    os.mkdir("root")

max_page = 1208  # last paginated listing page to crawl (inclusive)
start_page = 2   # first paginated page (page 1 lives at /index.htm, not /index_1.htm)
web = "http://www.netbian.com"
test_url = "/index.htm"    # used only to probe that the site is reachable
url = "/index_{}.htm"      # template for paginated listing pages

# Pretend to be a desktop browser; the site may reject the default requests UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
}

pre_page = ""
try:
    # Bounded timeout so an unreachable host fails fast instead of hanging forever.
    resp = requests.get(web + test_url, headers=headers, timeout=10)
    pre_page = resp.text
    resp.close()
    if not resp.ok:
        print("无法连接网站")
        sys.exit(1)
except requests.RequestException:
    # Narrow except: only network-level failures mean "cannot connect";
    # the original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    print("无法连接网站异常")
    sys.exit(1)
print("连接成功,正在爬取...")
# --- Main crawl loop: fetch each listing page, then every detail page's image ---
for i in range(start_page, max_page + 1):
    pageurl = web + url.format(i)
    try:
        resp = requests.get(pageurl, headers=headers, timeout=10)
        page_text = resp.text
        resp.close()
    except requests.RequestException:
        print("无法爬取页面", i, "正在跳过")
        continue

    soup = BeautifulSoup(page_text, "html.parser")
    div = soup.find("div", class_="list")
    anchors = div.find_all("a")
    # Keep only site-relative detail links; hrefs containing "http" are
    # absolute links to ads/external pages and are skipped.
    de = [a.get("href") for a in anchors if "http" not in str(a.get("href"))]
    print("在第{}个界面搜索到".format(i), de)

    # One sub-folder per listing page number.
    page_dir = "root/{}".format(i)
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)

    for ph in de:
        try:
            turl = web + ph
            tresp = requests.get(turl, headers=headers, timeout=10)
            tpage = BeautifulSoup(tresp.text, "html.parser")
            # AttributeError here (no div.pic / no img) is caught below as a failed item.
            img_path = tpage.find("div", class_="pic").find("img").get("src")
            print("\r爬取", img_path, end="")
            tresp.close()
            tresp = requests.get(img_path, headers=headers, timeout=10)
            name = img_path.split("/")[-1]
            # `with` guarantees the file handle is closed even if write() raises;
            # the original leaked the handle on error.
            with open("root/" + str(i) + "/" + str(name), "wb") as f:
                f.write(tresp.content)
            tresp.close()
        except (requests.RequestException, AttributeError, OSError):
            # Best-effort per image: report and move on to the next link.
            print("\n")
            print("爬取{}失败...".format(ph))
            continue
# 一个批量爬取图片的代码,拿走不谢
# (blog-paste residue, not code) 最新推荐文章于 2024-04-16 10:06:03 发布