Scraping with the third-party requests library
I was originally going to scrape pictures of handsome guys, but thought better of it in case anyone got the wrong idea about my orientation. Yikes!
Here, requests.get issues a GET request, much like calling urlopen without setting the data argument.
requests is more convenient to use, though, and it has plenty of powerful features I'll dig into when I have time; consider this a placeholder.
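To make the comparison concrete, here is a minimal sketch of the same GET request made with urllib and with requests (the URL and the abbreviated User-Agent string are just stand-ins for illustration):

import requests
from urllib.request import Request, urlopen

url = "http://www.shuaia.net"         # example target, same site as below
head = {"User-Agent": "Mozilla/5.0"}  # abbreviated UA string, for illustration only

# urllib: wrap the URL and headers in a Request object, then open it
html_urllib = urlopen(Request(url, headers=head)).read().decode("utf-8")

# requests: one call, headers passed as a keyword argument
resp = requests.get(url, headers=head)
resp.encoding = "utf-8"
html_requests = resp.text

Now the actual script: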
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time
head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
urlbase = "http://www.shuaia.net"
pages_url = [urlbase]
for i in range(2, 5):
pages_url.append(urlbase + "/index_%d.html" % i)
# collect the detail-page links whose titles contain "花" or "女"
pictures_url = []
for url in pages_url:
    req = requests.get(url, headers=head)   # pass the UA header explicitly
    req.encoding = "utf-8"
    html = req.text
    soup = BeautifulSoup(html, 'lxml')
    target = soup.find_all("a", class_="item-img")
    for picture in target:
        name = picture.img.get("alt")
        if "花" in name or "女" in name:
            picture_url = picture.get("href")
            final_link = name + "=" + picture_url   # keep the name and URL together for the next loop
            pictures_url.append(final_link)
# visit each detail page, locate the image and save it to F:\Driver_images
for eachurl in pictures_url:
    name, target_url = eachurl.split("=")
    filename = name + ".jpg"
    pic_req = requests.get(target_url, headers=head)
    pic_req.encoding = "utf-8"
    pic_html = pic_req.text
    soup = BeautifulSoup(pic_html, 'lxml')
    div1 = soup.find("div", class_="wr-single-content-list")
    try:
        pic_url = urlbase + div1.img["src"]
        if "Driver_images" not in os.listdir("F:\\"):
            os.makedirs(r"F:\Driver_images")
        urlretrieve(pic_url, "F:\\Driver_images\\" + filename)
        print(name)
    except AttributeError:
        print("Invalid link!")
    # time.sleep(1)
    # such a small site doesn't need throttling
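As a side note, the download step doesn't have to go through urlretrieve; requests can stream the file to disk as well. A minimal sketch under the same assumptions (pic_url, filename and head here are placeholders standing in for the variables from the loop above):

import os
import requests

head = {"User-Agent": "Mozilla/5.0"}           # abbreviated; use the full head dict from above
pic_url = "http://www.shuaia.net/example.jpg"  # placeholder; use pic_url from the loop
filename = "example.jpg"

save_dir = r"F:\Driver_images"
os.makedirs(save_dir, exist_ok=True)           # create the folder if it doesn't exist yet

# stream=True fetches the body in chunks instead of loading it all into memory at once
resp = requests.get(pic_url, headers=head, stream=True)
with open(os.path.join(save_dir, filename), "wb") as f:
    for chunk in resp.iter_content(8192):
        f.write(chunk)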