代码分享之利用selenium和多线程自动爬取美女图片
# @Time : 2021/5/30 17:21
# @Author : Dian
# @File : en
# @Software: PyCharm
from selenium import webdriver
from lxml import etree
import time
from fake_useragent import UserAgent
from queue import Queue
import requests
import threading
import os
# Module-level Chrome WebDriver session. It is created as soon as this module
# is executed and is the browser that bingSpider.getSource() drives.
driver = webdriver.Chrome()
class bingSpider(object):
    """Collect image URLs from a Bing image-search page and download them.

    ``getSource`` drives the module-level Selenium ``driver`` to load and
    scroll the results page, then queues every image ``src`` it finds.
    ``parse_html`` is the download worker that drains the queue to disk, and
    ``run`` executes that worker on several threads.
    """

    # Default output directory (the location the script originally hard-coded).
    # A raw string keeps the backslashes literal without invalid escapes.
    DEFAULT_SAVE_DIR = r'D:\python\爬虫\课外扩展\2\tupian3'

    # ``src`` values longer than this are not queued.
    # NOTE(review): presumably these are inline ``data:`` URIs rather than
    # real links - confirm against the live page.
    MAX_URL_LENGTH = 120

    def __init__(self, save_dir=None):
        """Set up the URL queue, the lock and the shared file counter.

        Args:
            save_dir: Directory the images are written to. Defaults to
                ``DEFAULT_SAVE_DIR`` when omitted.
        """
        self.baseurl = 'https://cn.bing.com/images/search?q=%e9%ab%98%e6%b8%85%e5%a3%81%e7%ba%b8%e7%be%8e%e5%a5%b3&qpvt=%e9%ab%98%e6%b8%85%e5%a3%81%e7%ba%b8%e7%be%8e%e5%a5%b3&form=IQFRML&first=1&tsc=ImageBasicHover'
        self.save_dir = save_dir if save_dir is not None else self.DEFAULT_SAVE_DIR
        # Queue of image URLs waiting to be downloaded.
        self.q = Queue()
        # Guards the "check empty, then get" step and the file counter.
        self.lock = threading.Lock()
        # Number of <li> result items seen by the last getSource() call.
        self.numb = 0
        # Next output file number; shared by every worker so that two
        # threads can never write to the same file name.
        self._next_index = 0

    def getSource(self, tim1):
        """Load the search page, keep scrolling, then queue the image URLs.

        Args:
            tim1: Scrolling budget in seconds. The page is scrolled to the
                bottom repeatedly until this much time has elapsed, so a
                larger value loads (and therefore queues) more images.
        """
        driver.get(self.baseurl)
        time.sleep(2)
        started = time.time()
        elapsed = 0
        while tim1 > elapsed:
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            elapsed = time.time() - started
            # Give the page time to load the next batch before scrolling on.
            time.sleep(3)

        columns = etree.HTML(driver.page_source).xpath('//*[@id="mmComponent_images_1"]/ul')
        total_items = 0
        for column in columns:
            items = column.xpath('./li')
            total_items += len(items)
            for item in items:
                sources = item.xpath('./div/div/a/div/img/@src')
                if not sources:
                    # Some list items carry no <img>; indexing [0] would raise.
                    continue
                src = sources[0]
                if len(src) > self.MAX_URL_LENGTH:
                    continue
                self.q.put(src)
        self.numb = total_items

    def _download(self, url):
        """Fetch ``url`` with a random User-Agent and return the body bytes."""
        headers = {'User-Agent': UserAgent().random}
        response = requests.get(url, headers=headers, timeout=10)
        # Do not store an HTTP error page as if it were an image.
        response.raise_for_status()
        return response.content

    def parse_html(self):
        """Download queued URLs into ``save_dir`` until the queue is empty.

        Safe to run from several threads at once. Files are named
        ``<n>.png`` where ``n`` comes from a counter shared by all workers.
        A URL whose download fails is reported and skipped.
        """
        # exist_ok avoids the exists()/makedirs() race between workers.
        os.makedirs(self.save_dir, exist_ok=True)
        while True:
            # empty() followed by get() must be atomic, otherwise another
            # worker could take the last item and get() would block forever.
            with self.lock:
                url = None if self.q.empty() else self.q.get()
            if url is None:
                break
            try:
                data = self._download(url)
            except requests.RequestException as exc:
                print(f'skip {url}: {exc}')
                continue
            # Take a file number only after a successful download so the
            # numbering has no gaps.
            with self.lock:
                index = self._next_index
                self._next_index += 1
            with open(os.path.join(self.save_dir, f'{index}.png'), 'wb') as file_obj:
                file_obj.write(data)

    def run(self, thread_count=5):
        """Drain the queue with ``thread_count`` worker threads.

        Blocks until every worker has finished.

        Args:
            thread_count: Number of download threads to start (default 5).
        """
        workers = [
            threading.Thread(target=self.parse_html)
            for _ in range(thread_count)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
if __name__ == '__main__':
    spider = bingSpider()
    try:
        # The argument is the scrolling time in seconds; scrolling for longer
        # loads more results and therefore queues more images.
        spider.getSource(10)
    finally:
        # The page source has been read by now, so the browser is no longer
        # needed. Quit it even on failure so no Chrome process is left behind.
        driver.quit()
    spider.run()
    # The main thread runs the same worker loop on whatever is still queued.
    spider.parse_html()
注:
1. 如果 import 语句报红（提示找不到模块），说明对应的库尚未安装，需要先用 pip 安装，例如：pip install selenium lxml fake-useragent requests。
2. 使用 selenium 控制 Chrome 需要下载 chromedriver，其版本号必须与本机安装的 Chrome 浏览器版本匹配。
3.有不明白的地方可以留言哦,我一定会细心解答。