使用selenium对彼岸网的4k壁纸进行爬取,为了给node后端写接口提供数据,大佬勿喷
1.用到了selenium的动作链,进行搜索框点击搜索,并且对下一页进行点击
2.用到了pyquery库进行dom解析,pyquery是爬虫解析的利器,用起来简单粗暴,里面的api的使用方法几乎和jQuery一样
3.这里将爬取下来的图片链接还有图片描述保存进了mongodb数据库
4.这里的selenium我用了接管浏览器的方式,避免被网站的反爬机制检测到
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from pyquery import PyQuery as pq
import time
import pymongo
# Attach to an already-running Chrome instance through its remote-debugging
# address (127.0.0.1:9222) rather than configuring a fresh browser session.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# NOTE(review): `executable_path` is a Selenium 3 style argument and the path is
# machine-specific - confirm it matches the installed Selenium/chromedriver.
bro = webdriver.Chrome(executable_path='D:/pythonProject/Selenium/chromedriver.exe', options=chrome_options)
bro.maximize_window() # maximize the browser window
# Ask which collection of the "Photos" database should receive the results.
dbCol = input('数据库Photos的表名:')
myClient = pymongo.MongoClient('mongodb://localhost:27017/') # connect to the local MongoDB server
mydb = myClient['Photos'] # select the "Photos" database
mycol = mydb[dbCol] # collection that saveDb() inserts the scraped documents into
# Thumbnail URLs, appended by getPicLink() and read back by saveDb().
smallPic = []
def search_box(name):
    """Search pic.netbian.com for *name* and start collecting the results.

    Opens the site's home page in the attached browser, clears the search
    box, types ``name`` into it, submits with Enter and then calls
    ``getPicLink()`` to process the result pages.

    Args:
        name: The keyword to type into the site's search box.
    """
    # Local import keeps this function self-contained; ``By`` ships with the
    # selenium package this file already depends on.
    from selenium.webdriver.common.by import By

    bro.get('https://pic.netbian.com/')
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; this form is available in Selenium 3 as well.
    search_input = bro.find_element(By.XPATH, '//*[@id="schform"]/p/input')
    search_input.send_keys(Keys.CONTROL, 'a')  # select whatever is in the box
    search_input.send_keys(Keys.BACK_SPACE)  # ...and delete it
    search_input.send_keys(name)  # type the search keyword
    time.sleep(2)  # short pause before submitting
    search_input.send_keys(Keys.ENTER)  # submit the search
    getPicLink()
# Collect the detail-page link of every result; those pages are visited later
# to obtain the address of the 4K image.
def getPicLink():
    """Walk the result pages, record the image links and continue with getPic().

    For up to 40 pages the current page is scrolled to the bottom and parsed.
    For every result anchor that contains an ``<img>``, the thumbnail URL is
    appended to the module-level ``smallPic`` list and the detail-page URL is
    collected. The collected detail-page URLs are then written to
    ``piclink.txt`` (one per line) and ``getPic()`` is called.

    Paging stops, with a printed message, as soon as a page cannot be
    processed or the next-page link cannot be clicked. Stopping (instead of
    silently retrying) avoids scraping the same page again on every remaining
    iteration, which would duplicate its links.
    """
    # Local import keeps this function self-contained; ``By`` ships with the
    # selenium package this file already depends on.
    from selenium.webdriver.common.by import By

    base_url = 'https://pic.netbian.com'
    linkList = []
    for _ in range(40):
        try:
            time.sleep(2.5)  # give the page time to load
            bro.execute_script("window.scrollTo(0,document.body.scrollHeight)")  # scroll to the bottom
            html = pq(bro.page_source, parser='html')
            for a in html('.slist ul li a').items():
                src = a('img').attr('src')
                href = a.attr('href')
                # Anchors without an image (or without a target) are not
                # picture entries; skip them.
                if src is None or href is None:
                    continue
                smallPic.append(base_url + src)
                linkList.append(base_url + href + '\n')
            # XPath used for search-result pages: the last link of the pager
            # is taken to be the "next page" link.
            # NOTE: category listings could instead be fetched directly by
            # URL, e.g. https://pic.netbian.com/4kmeinv/index_<n>.html
            next_link = bro.find_element(By.XPATH, '//*[@id="main"]/div[3]/a[last()]')
            actions(next_link)  # click through to the next page
        except Exception as exc:
            print('Stopped paging:', exc)
            break
    with open('piclink.txt', 'w') as f:
        for lk in linkList:
            f.write(lk)
            print(lk, '写入成功')
    getPic()
def getPic():
    """Load the saved detail-page links and hand them to saveDb().

    Reads ``piclink.txt`` line by line, strips surrounding whitespace
    (including the trailing newline) from each line and passes the resulting
    list to ``saveDb()``. Lines are kept one-to-one so their positions stay
    unchanged.
    """
    # The context manager closes the file even if reading fails.
    with open('piclink.txt', 'r') as f:
        picLink = [line.strip() for line in f]
    saveDb(picLink)
# Visit every detail page to obtain the real address of the 4K image.
def saveDb(picLink):
    """Scrape each detail page and store the results in MongoDB.

    For every URL in ``picLink`` the page is opened in the attached browser,
    its title and image ``src`` are extracted, and a document with the keys
    ``_id``, ``id``, ``desc``, ``smallPic`` and ``picLink`` is built. All
    documents are inserted into ``mycol`` in one ``insert_many`` call.

    Args:
        picLink: Detail-page URLs, in the same order in which their
            thumbnails were appended to the module-level ``smallPic`` list.

    Pages that fail to load or parse are reported and skipped. Ids are
    assigned consecutively (starting at 1) to the pages that succeed.
    """
    base_url = 'https://pic.netbian.com'
    docs = []
    next_id = 1
    for index, link in enumerate(picLink):
        try:
            bro.get(link)
            html = pq(bro.page_source, parser='html')
            text = html('.photo-hd h1').text()
            src = html('.photo-pic #img img').attr('src')
            if src is None:
                print('No image found on', link)
                continue
            docs.append({
                '_id': next_id,  # MongoDB primary key
                'id': next_id,
                'desc': text,
                # Index by the link's position, not by the running id, so
                # a skipped page cannot shift later thumbnails onto the
                # wrong picture.
                'smallPic': smallPic[index],
                'picLink': base_url + src,
            })
            next_id += 1
            time.sleep(2.5)  # pause between successful page visits
        except Exception as exc:
            print('Skipped', link, '-', exc)
    # insert_many() rejects an empty list, so only call it with data.
    if docs:
        mycol.insert_many(docs)
# Action-chain helper.
def actions(dom):
    """Hover over *dom* and click it using a Selenium action chain.

    Args:
        dom: The web element to move the pointer to and click.
    """
    chain = ActionChains(bro)
    chain.move_to_element(dom)
    chain.click()
    chain.perform()
    chain.reset_actions()  # clear the queued actions once they have run
# Script entry: ask for a search keyword and start the scrape with it.
name = input('Search Something:')
search_box(name)