这是需要爬取的网站首页,里面是一些女人的照片。至于为什么要爬这个,我想只有女人的照片才会被集中、分模块地放在网站里面,其他的素材很少。网址是:https://pic.yesky.com/
import os
import requests
import random
import re
import time
from bs4 import BeautifulSoup
导入一些必要的包
前期的准备工作,代码有点长,放图片了:先是用os修改存储的位置,然后是随机的headers_list,然后是url_list,因为我要批量爬取,跑函数的时候遍历列表就行。
然后是最基础的requests的基本语法,返回一个html给下个函数分析用
这里分析我用的是beautifulsoup和re配合使用,先大范围地找到一段html源码,再用re分析,然后去除列表里相同的网址,得到一个png_1的列表。
然后是保存部分,这个就很简单了:找到我们要爬取的png地址之后,再重写一遍requests爬取图片。其实在这里直接调用第一个函数是不是会更好一点?如果遇到大工程的话,这样写会很麻烦。我一般喜欢用当前的时间戳来为图片命名(时间戳代码我根本不会写,每次都是找到以前的代码copy,难蚌),这个每个人都有不同的方法,不多说了。然后就是一些细节,显示爬取的个数,不再赘述了。
最后是调用这些函数。代码就此结束了。
所有的代码如下所示,欢迎各位交流,感谢各位的支持。
import os
import requests
import random
import re
import time
from bs4 import BeautifulSoup
# Save location: images are written with relative file names, so they land in
# whatever directory is current after this chdir.
save_dir = "C:/Users/Secret/Desktop/New folder"
# Create the folder first; os.chdir raises FileNotFoundError if it is missing.
os.makedirs(save_dir, exist_ok=True)
os.chdir(save_dir)
# Pool of browser user-agent strings. Order and duplicates are kept as-is so
# that a random pick from headers_list has the same distribution as before.
_USER_AGENTS = (
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666",
    "Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320",
    "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/10.0.9.2372 Mobile Safari/537.10+",
    "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/7.2.1.0 Safari/536.2+",
    "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",
    "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
    "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
    "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
    "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
    "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1",
)

# One request-header dict per user agent, in the shape requests expects.
headers_list = [{"user-agent": agent} for agent in _USER_AGENTS]
# Disabled proxy pool, kept for reference only; the visible code never reads
# proxy_list or proxy.
# proxy_list = [
# {"https": "61.216.185.88"},
# {"https": "123.169.34.216"},
# {"https": "36.134.91.82"},
# {"https": "183.64.239.19"},
# {"https": "123.169.34.113"},
# {"https": "222.66.202.6"},
# ]
# proxy = random.choice(proxy_list)
# Pick one random header dict. This runs once at module level, so the same
# user agent is sent with every request made through `headers` in this run.
headers = random.choice(headers_list)
# Gallery pages to crawl. Each entry below is the "<dir>/<page id>" part of a
# page address; the shared prefix and suffix are attached when url_list is built.
_PAGE_PATHS = (
    "239/2147470739",
    "62/2147471562",
    "64/2147471564",
    "318/2147474818",
    "43/2147468543",
    "197/2147471697",
    "93/2147471093",
    "230/2147470730",
    "195/2147471695",
    "95/2147472595",
    "326/2147478826",
    "431/2147475431",
    "226/2147470726",
    "432/2147473432",
    "305/2147478805",
    "237/2147470737",
    "425/2147473425",
    "86/2147471086",
    "84/2147471084",
    "103/2147472603",
    "316/2147478816",
    "198/2147471698",
    "435/2147475435",
    "67/2147471567",
    "91/2147472591",
    "96/2147472596",
    "107/2147472607",
    "100/2147471100",
    "66/2147471566",
    "102/2147472602",
    "99/2147472599",
    "334/2147478834",
    "433/2147475433",
    "95/2147471095",
    "36/2147468536",
    "90/2147472590",
    "202/2147471702",
    "69/2147471569",
    "322/2147478822",
    "81/2147472581",
    "235/2147470735",
    "311/2147478811",
    "72/2147471572",
    "90/2147471090",
    "346/2147469346",
    "408/2147470408",
    "82/2147472582",
    "310/2147478810",
    "102/2147471102",
    "110/2147472610",
    "345/2147469345",
    "260/2147476760",
    "92/2147472592",
    "227/2147470727",
    "80/2147472580",
    "93/2147472593",
    "309/2147478809",
    "223/2147470723",
    "88/2147471088",
    "53/2147468553",
    "256/2147476756",
    "236/2147470736",
    "60/2147471560",
    "228/2147470728",
    "65/2147471565",
    "89/2147471089",
    "84/2147472584",
    "368/2147469368",
    "306/2147478806",
    "344/2147469344",
    "47/2147468547",
    "94/2147472594",
    "37/2147468537",
    "406/2147470406",
    "199/2147471699",
    "68/2147471568",
    "83/2147472583",
    "324/2147474824",
    "229/2147470729",
    "81/2147471081",
    "233/2147470733",
    "251/2147476751",
    "85/2147472585",
    "201/2147471701",
    "92/2147471092",
    "89/2147472589",
    "321/2147474821",
    "99/2147471099",
)

url_list = ["https://pic.yesky.com/" + path + ".shtml" for path in _PAGE_PATHS]
def geturl(url):
    """Download a page and return its HTML as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Without a timeout a stalled server would block the whole crawl forever.
    resp = requests.get(url=url, headers=headers, timeout=10)
    # GB18030 is a superset of GB2312/GBK, so pages that contain characters
    # outside strict GB2312 no longer raise UnicodeDecodeError; any byte that
    # still cannot be decoded is replaced instead of aborting the page.
    html = resp.content.decode("gb18030", errors="replace")
    return html
def anlysis(url=None):
    """Collect the image addresses referenced by a gallery page.

    Args:
        url: Page to analyse. When omitted, the module-level loop variable
            ``i`` is used, which is how existing no-argument calls work.

    Returns:
        A list of the non-empty ``data-src`` values found on or inside the
        ``.swiper-slide`` elements, without duplicates, in page order.
    """
    if url is None:
        # Legacy behaviour: fall back to the page currently being iterated.
        url = i
    html = geturl(url)
    soup = BeautifulSoup(html, "html.parser")
    found = []
    for slide in soup.select(".swiper-slide"):
        # The attribute may sit on the slide itself or on a nested tag.
        # Reading it from the parsed tag (rather than regex-matching the
        # re-serialised HTML) yields the real URL, with no entity escaping.
        for tag in [slide, *slide.find_all(attrs={"data-src": True})]:
            src = tag.get("data-src")
            if src:
                found.append(src)
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # a set would return the images in arbitrary order.
    png_1 = list(dict.fromkeys(found))
    return png_1
def save_photo():
    """Download every image of the current page into the working directory.

    The image addresses come from ``anlysis()``. Each file is named after the
    current local time (``%Y_%m_%d%H_%M_%S`` plus ``.png``). An image whose
    download fails or returns an HTTP error status is reported and skipped,
    so one bad link does not abort the crawl or leave an error page on disk.
    """
    png = anlysis()
    for a, j in enumerate(png, start=1):
        try:
            resp_1 = requests.get(url=j, headers=headers, timeout=10)
            # Do not store an error page under an image file name.
            resp_1.raise_for_status()
        except requests.RequestException as err:
            print("第{}张下载失败,已跳过: {}".format(a, err))
            continue
        print("正在保存第{}张图片".format(a))
        # Name the file after the current timestamp.
        filename = time.strftime("%Y_%m_%d%H_%M_%S", time.localtime()) + ".png"
        with open(filename, "wb") as f:
            f.write(resp_1.content)
        # The file name only has one-second resolution, so wait a second
        # before the next image to keep names unique (and to go easy on the
        # server). The file is already closed at this point.
        time.sleep(1)
        print("第{}张保存成功".format(a))
# Crawl every gallery page in turn. The loop variable must stay named ``i``:
# anlysis() reads the module-level ``i`` to know which page to fetch.
for i in url_list:
    print(i)
    # save_photo() fetches and parses the page itself (through anlysis() and
    # geturl()), so the former extra geturl(i) / anlysis() calls only repeated
    # the same downloads and threw the results away.
    save_photo()