一.目标网站
目标网站:https://imoemei.com/
网站爬取分类:https://imoemei.com/meinv/page/{n}
一共11页
二.网站分析
1.翻页方法
所有翻页全部采用for循环,有更好的更简单的方法请大神指点一下
- 写真栏目
- 一共11页,考虑翻页问题
for n in range(1,12):
url=f'https://imoemei.com/meinv/page/{n}'
- 每一页30张封面
2.提取每个封面的链接以及名称
- 用requests对url进行请求
- 用bs4.BeautifulSoup,"html.parser"对网页源码进行解析
- 用select定位所需的内容
- 用re正则表达式对需要的子页面链接和名称进行提取
resp=requests.get(url,headers)
exSoup=bs4.BeautifulSoup(resp.text,'html.parser')
onePage=exSoup.select('h2')
oneLink=re.findall(r'<a href="(.*?)">',str(onePage))
oneNames=re.findall(r'html">(.*?)</a>',str(onePage))
3.二级页面
- 翻页&&访问
for ones in range(len(oneLink)):
header = {
"user-agent": random.choice(ualist)
}
print("\033[0;31m%s\033[0m" % f"------------正在下载{oneNames[ones]}小姐姐图片------------")
#访问二级页面
tworesp=requests.get(url=oneLink[ones],headers=header)
3.1提取图片链接
twoSoup=bs4.BeautifulSoup(tworesp.text,'html.parser')
twoPage=twoSoup.select('img[src]')
jpgs=re.findall(r'<img src="(.*?)"/>',str(twoPage))
3.2保存图片
#保存图片
for i in range(len(jpgs)):
print(f'{jpgs[i]}')
try:
jpg = requests.get(url=jpgs[i],headers=header)
with open(f'D:/爬取图片/萌妹子/{oneNames[ones]}{i}.jpg','wb') as f:
f.write(jpg.content)
time.sleep(0.1)
print(f'{oneNames[ones]}{i}.jpg下载完成!')
except:
print(f'{oneNames[ones]}{i}.jpg下载失败!')
三.代码及运行结果
1.完整代码
'''
2021-8-1
目标网站:https://imoemei.com/
网站爬取分类:https://imoemei.com/meinv/page/{n}
'''
import os
import random
import re
import time

import bs4
import requests
# Pool of desktop-browser User-Agent strings. One entry is picked at random
# per request so the scraper's traffic looks less uniform to the server.
ualist = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3872.400 QQBrowser/10.8.4455.400",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15",
]
# Headers for the first-level (listing page) requests; the second-level
# gallery requests build their own fresh `header` dict inside the loop.
headers = {
"user-agent":random.choice(ualist)
}
# Create the output directory once, up front — the original script crashed on
# the first open() if the folder did not already exist.
save_dir = 'D:/爬取图片/萌妹子'
os.makedirs(save_dir, exist_ok=True)

for n in range(1, 12):  # the 写真 category spans pages 1..11
    url = f'https://imoemei.com/meinv/page/{n}'
    # BUG FIX: the original called requests.get(url, headers), which passes
    # the dict as the positional *params* argument — the User-Agent header
    # was never actually sent. It must be the `headers=` keyword.
    resp = requests.get(url, headers=headers)
    exSoup = bs4.BeautifulSoup(resp.text, 'html.parser')
    # Read href/text straight off the <h2><a> anchors instead of regexing
    # str(tag_list): attribute access stays correct even if the tags carry
    # extra attributes that would confuse the old patterns.
    oneLink = []
    oneNames = []
    for anchor in exSoup.select('h2 a[href]'):
        oneLink.append(anchor['href'])
        oneNames.append(anchor.get_text(strip=True))
    for ones in range(len(oneLink)):
        # Rotate the User-Agent per gallery request.
        header = {
            "user-agent": random.choice(ualist)
        }
        print("\033[0;31m%s\033[0m" % f"------------正在下载{oneNames[ones]}小姐姐图片------------")
        # Fetch the second-level (gallery) page.
        tworesp = requests.get(url=oneLink[ones], headers=header)
        twoSoup = bs4.BeautifulSoup(tworesp.text, 'html.parser')
        # Collect every image URL on the gallery page.
        jpgs = [img['src'] for img in twoSoup.select('img[src]')]
        # Windows forbids \ / : * ? " < > | in file names; sanitize the
        # scraped title so it can never break the open() call.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', oneNames[ones])
        # Save each image to disk.
        for i in range(len(jpgs)):
            print(f'{jpgs[i]}')
            try:
                jpg = requests.get(url=jpgs[i], headers=header)
                with open(f'{save_dir}/{safe_name}{i}.jpg', 'wb') as f:
                    f.write(jpg.content)
                time.sleep(0.1)  # small throttle to be polite to the server
                print(f'{oneNames[ones]}{i}.jpg下载完成!')
            except (requests.exceptions.RequestException, OSError):
                # Narrowed from a bare `except:` so Ctrl-C still interrupts
                # the run; one failed image must not abort the whole crawl.
                print(f'{oneNames[ones]}{i}.jpg下载失败!')
        tworesp.close()
    resp.close()
    print('\033[45m------------------------------------------------------------------')
    print(f'\033[45m第{n}页全部保存完毕')
2.运行结果
3.成品欣赏
截止到目前正在爬第8页,已经有1700+图🤭🤭🤭🤭