Scraping
We're going to scrape a certain meizi site, taking everything on the first page of each of its four girl categories. Let's go!
I'll skip the page analysis; that's the most basic part of any crawler. This site has basically no anti-scraping measures. It just bans IPs readily, so with a little care you'll have no real problems.
The only anti-scraping measure
Most of the time, pulling information from this site is simple. The one step where you can trip up is the last one: downloading the images. The image server checks an anti-hotlink header (Referer), but adding it to your request headers gets you through.
headers = {
    "User-Agent": ua.firefox,
    'Referer': 'http://i.meizitu.net'  # anti-hotlink header
}
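To see the header working on its own, here is a minimal sketch of downloading a single image with it. The image URL is a hypothetical placeholder, not a real file on the site:
import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "User-Agent": ua.firefox,
    'Referer': 'http://i.meizitu.net'  # without this the image server rejects the request
}
img_url = 'https://i3.mmzztt.com/2020/01/example01.jpg'  # hypothetical placeholder URL
resp = requests.get(img_url, headers=headers)
if resp.status_code == 200:
    with open('test.jpg', 'wb') as f:
        f.write(resp.content)  # resp.content holds the raw image bytes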
Here's the full code:
import concurrent.futures
import os
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

ua = UserAgent()
picname = 1


def mk_fenleidir_url():
    # Fetch the home page and yield (title, url) for the four girl categories.
    headers = {
        "User-Agent": ua.firefox,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    res = requests.get("https://www.mzitu.com/", headers=headers).text
    html = etree.HTML(res)
    title = html.xpath('//*[@id="menu-nav"]/li/a/text()')[1:5]
    urls = html.xpath('//*[@id="menu-nav"]/li/a/@href')[1:5]
    for i in range(len(title)):
        yield title[i], urls[i]


def intotime(url):
    # Yield (album title, album url) for every album on a category's first page.
    headers = {
        "User-Agent": ua.firefox,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    res = requests.get(url, headers=headers).text
    html = etree.HTML(res)
    # Album titles
    pictitle = html.xpath('//*[@id="pins"]/li/span/a/text()')
    # Second-level links into each album
    pichref = html.xpath('//*[@id="pins"]/li/span/a/@href')
    for i in range(len(pictitle)):
        yield pictitle[i], pichref[i]


def download_pic(dir, picdir, url):
    global picname
    a = random.randint(1, 3)
    headers = {
        "User-Agent": ua.firefox,
        'Referer': 'http://i.meizitu.net'  # anti-hotlink header
    }
    res = requests.get(url, headers=headers).text
    html = etree.HTML(res)
    nextpichref = html.xpath('/html/body/div[2]/div[1]/div[4]/a/@href')
    nextpictext = html.xpath('/html/body/div[2]/div[1]/div[4]/a/span/text()')
    nexttext = nextpictext[-1]
    picdownload = html.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')
    pic = requests.get(picdownload[0], headers=headers).content
    if not os.path.exists('./meizitu/%s/%s/%s.jpg' % (dir, picdir, picname)):
        with open('./meizitu/%s/%s/%s.jpg' % (dir, picdir, picname), 'wb') as f:
            f.write(pic)
    # The text of the last pager link tells us whether the next page
    # still belongs to this album.
    if nexttext == "下一页»":
        picname += 1
        nexturl = nextpichref[-1]
        # Create a thread for the next page via concurrent.futures.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            executor.submit(download_pic, dir, picdir, nexturl)
        time.sleep(a)


def download():
    for i in mk_fenleidir_url():
        os.makedirs('./meizitu/' + i[0], exist_ok=True)
        for page in intotime(i[1]):
            os.makedirs('./meizitu/%s/' % (i[0]) + page[0], exist_ok=True)
            download_pic(i[0], page[0], page[1])


if __name__ == '__main__':
    download()
=======================================
# Multi-coroutine crawler
import asyncio
import concurrent.futures  # only needed by the commented-out process-pool variant below
import os
import re

import aiofiles
import aiohttp
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

"""
Parse the meizi site with bs4 and download the images: https://www.mzitu.com/japan/
"""
url = "https://www.mzitu.com/japan/"
list1 = list()
a = 1


def geturls(url):
    # Collect (album url, album title) pairs from the first three list pages.
    global a
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random
    }
    html = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(html, "lxml")
    pageurls = soup.select('ul[id="pins"] > li > span > a')
    nextpage = soup.select('a[class="next page-numbers"]')
    for i in nextpage:
        if a <= 3:
            a += 1
            geturls(i.get('href'))
    for url in pageurls:
        list1.append((url.get('href'), url.get_text()))


def pagedetail(url):
    # Read an album's total page count and yield the URL of every page in it.
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random
    }
    html = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(html, "lxml")
    nexturl = soup.select('div[class="pagenavi"] > a > span')[-2]
    print(nexturl.string)  # debug: total pages in this album
    for i in range(1, int(nexturl.string) + 1):
        yield "".join((url, "/%s" % (i)))


def picurl(pageurl):
    # Extract the real image URL from one album page.
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        'Referer': 'http://i.meizitu.net'
    }
    html = requests.get(url=pageurl, headers=headers).text
    soup = BeautifulSoup(html, "lxml")
    imageurl = soup.select('div[class="main-image"] > p > a > img')[0]
    url = imageurl.get('src')
    yield url


async def download_pic(picdir, url):
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        'Referer': 'http://i.meizitu.net'
    }
    # assumes the image URL matches the i3.mmzztt.com pattern
    name = re.findall(r'https://i3\.mmzztt\.com/\d+/\d+/(\w+)\.jpg', url)
    # note: a fresh semaphore per call limits nothing; see the sketch below
    sem = asyncio.Semaphore(100)
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                ym = await resp.read()
                async with aiofiles.open('./meizitu/%s/%s.jpg' % (picdir, name[0]), 'wb') as f:
                    await f.write(ym)


def download():
    geturls(url)
    for dirs in list1:
        os.makedirs('./meizitu/' + dirs[1], exist_ok=True)
        for pageurl in pagedetail(dirs[0]):
            tasks = [asyncio.ensure_future(download_pic(dirs[1], urls)) for urls in picurl(pageurl)]
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(tasks))


if __name__ == '__main__':
    download()
    # with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    #     executor.submit(download)
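As the comment in download_pic notes, creating the Semaphore inside the coroutine limits nothing, because every call gets its own counter. Here is a minimal, self-contained sketch of how a shared semaphore actually caps concurrent downloads; the URLs and the limit of 5 are made-up placeholders:
import asyncio
import aiohttp

sem = asyncio.Semaphore(5)  # shared by all coroutines, so at most 5 run at once

async def fetch(session, url):
    async with sem:  # every task waits on the SAME semaphore
        async with session.get(url) as resp:
            return await resp.read()

async def main():
    urls = ['https://example.com/%d' % i for i in range(20)]  # hypothetical placeholders
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, u) for u in urls))
        print('downloaded %d responses' % len(results))

loop = asyncio.get_event_loop()
loop.run_until_complete(main())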
Screenshots of the results
I won't post the images. I spent half a day on it only to be told the post failed review for pornographic content??? All I uploaded were a few screenshots of the folder structure and that counted as porn. I had to laugh; I hadn't even posted screenshots of the pictures themselves, haha!
Problems
Partway through I realized the coupling is far too high, and I didn't feel like dealing with it anymore. There is also a naming problem: image names start from 1, and the counter should restart from 1 for each album, but I overlooked resetting it inside the loop. Too lazy to change it.
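For reference, the reset described above would amount to one extra line in the first script's download(); a sketch using those same names:
def download():
    global picname
    for i in mk_fenleidir_url():
        os.makedirs('./meizitu/' + i[0], exist_ok=True)
        for page in intotime(i[1]):
            picname = 1  # restart numbering for each album
            os.makedirs('./meizitu/%s/' % (i[0]) + page[0], exist_ok=True)
            download_pic(i[0], page[0], page[1])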
The purpose of this crawler
I wrote this crawler demo mainly to verify how concurrent.futures creates threads, so nothing else here was really the point. I also won't discuss the module's internals, because I haven't studied its source code myself. Hahaha.
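If you only want the thread-creation part, here is a minimal, self-contained sketch of the concurrent.futures usage the demo exercises; the work function is a made-up placeholder:
import concurrent.futures
import threading
import time

def work(n):
    # placeholder job: report which pool thread ran it
    time.sleep(0.1)
    return 'job %d ran on %s' % (n, threading.current_thread().name)

# submit() schedules a callable on a pool thread and returns a Future;
# the with-block waits for every submitted job before exiting.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(work, n) for n in range(10)]
    for fut in concurrent.futures.as_completed(futures):
        print(fut.result())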