selenium爬取4k壁纸，并存入数据库

最新推荐文章于 2024-04-19 22:31:50 发布

Mr-阿梓

最新推荐文章于 2024-04-19 22:31:50 发布

阅读量278

点赞数

分类专栏：笔记 ｜ 文章标签：selenium 爬虫 python 数据库

本文链接：https://blog.csdn.net/m0_55986526/article/details/120120340

版权

笔记专栏收录该内容

14 篇文章 0 订阅

订阅专栏

使用selenium对彼岸网的4k壁纸进行爬取，为了给node后端写接口提供数据，大佬勿喷

1.搜索时直接向搜索框发送按键（输入关键词后回车）；翻页时用到了selenium的动作链，对“下一页”进行点击
2.用到了pyquery库进行dom解析，pyquery是爬虫解析的利器，用起来简单粗暴，里面的api的使用方法几乎和jQuery一样
3.这里将爬取下来的图片链接还有图片描述保存进了mongodb数据库
4.这里的selenium我用了接管浏览器的方式（连接到已开启远程调试端口的Chrome），避免被网站的反爬机制检测到

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from pyquery import PyQuery as pq
import time
import pymongo

# Attach to a Chrome instance that is already listening on the given
# remote-debugging address, rather than configuring a fresh browser session.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# The chromedriver path is machine-specific.
bro = webdriver.Chrome(executable_path='D:/pythonProject/Selenium/chromedriver.exe', options=chrome_options)


# Maximize the browser window.
bro.maximize_window()

# Ask the user which collection of the "Photos" database should receive the data.
dbCol = input('数据库Photos的表名:')

# Connect to the local MongoDB server.
myClient = pymongo.MongoClient('mongodb://localhost:27017/')
# Select the "Photos" database.
mydb = myClient['Photos']
# Select the collection named by the user; saveDb inserts the records here.
mycol = mydb[dbCol]

# Thumbnail URLs; appended to by getPicLink and read by position in saveDb.
smallPic = []

def search_box(name):
    """Search pic.netbian.com for ``name`` and start collecting the results.

    Opens the site's home page, replaces whatever is in the search box with
    ``name``, submits the search with ENTER and then calls ``getPicLink()``.

    Args:
        name: The keyword to type into the search box.
    """
    bro.get('https://pic.netbian.com/')
    # Use the generic ``find_element(by, value)`` form: it exists in both
    # Selenium 3 and 4, whereas ``find_element_by_xpath`` was removed in 4.3.
    searchBox = bro.find_element('xpath', '//*[@id="schform"]/p/input')
    searchBox.send_keys(Keys.CONTROL, 'a')       # select any existing text
    searchBox.send_keys(Keys.BACK_SPACE)         # delete the selection
    searchBox.send_keys(name)                    # type the search keyword
    time.sleep(2)
    searchBox.send_keys(Keys.ENTER)              # submit the search

    getPicLink()

def getPicLink(max_pages=40):
    """Collect detail-page links and thumbnail URLs from the result pages.

    Visits up to ``max_pages`` result pages in the attached browser. For every
    anchor matched by ``.slist ul li a`` that has an ``href`` and contains an
    ``<img>`` with a ``src``, the thumbnail URL is appended to the module-level
    ``smallPic`` list and the detail-page URL is collected. The detail-page
    URLs are then written to ``piclink.txt`` (one per line) and ``getPic()``
    is called to continue the pipeline.

    Args:
        max_pages: Maximum number of result pages to visit. Defaults to 40,
            the value that used to be hard-coded.
    """
    base_url = 'https://pic.netbian.com'
    linkList = []
    seen = set()

    # saveDb pairs smallPic[i] with the i-th link, so start from an empty
    # list to keep both sequences aligned if this function runs more than once.
    del smallPic[:]

    for _ in range(max_pages):
        time.sleep(2.5)  # give the current page time to load
        try:
            # Scroll to the bottom of the page before reading the DOM.
            bro.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            html = pq(bro.page_source, parser='html')
            for a in html('.slist ul li a').items():
                src = a('img').attr('src')
                href = a.attr('href')
                # Skip anchors without a thumbnail or a target instead of
                # letting a None concatenation abort the whole page.
                if src is None or href is None:
                    continue
                # Skip links already collected (e.g. the same page was
                # parsed twice) so no duplicates reach the database.
                if href in seen:
                    continue
                seen.add(href)
                # Append to both lists together so they stay index-aligned.
                smallPic.append(base_url + src)
                linkList.append(base_url + href)
        except Exception as err:
            print('Failed to parse the current result page:', err)

        try:
            # XPath used for the search-result pager: the last anchor of the
            # pager is taken to be the "next page" link.
            next_link = bro.find_element('xpath', '//*[@id="main"]/div[3]/a[last()]')
            actions(next_link)
        except Exception as err:
            # Without a working next-page link, further iterations would
            # only re-read the same page, so stop paginating.
            print('Could not go to the next page, stopping:', err)
            break

    with open('piclink.txt', 'w') as f:
        for lk in linkList:
            f.write(lk + '\n')
            print(lk, '写入成功')
    getPic()

def getPic():
    """Read the saved detail-page links from ``piclink.txt`` and hand them to ``saveDb``.

    Each line of the file is one link; surrounding whitespace (including the
    trailing newline) is stripped before the list is passed on.
    """
    # ``with`` guarantees the file is closed even if reading raises.
    with open('piclink.txt', 'r') as f:
        picLink = [line.strip() for line in f]
    saveDb(picLink)

def saveDb(picLink):
    """Visit every detail page, resolve the full-size image URL and store the records.

    For each link the page is opened in the attached browser, the title is
    read from ``.photo-hd h1`` and the image ``src`` from
    ``.photo-pic #img img``. One document per successfully parsed page is
    built with the keys ``_id``, ``id``, ``desc``, ``smallPic`` and
    ``picLink``; all documents are inserted into ``mycol`` in a single call.
    Pages that cannot be loaded or parsed are reported and skipped.

    Args:
        picLink: Detail-page URLs, in the same order as the thumbnail URLs
            in the module-level ``smallPic`` list.
    """
    base_url = 'https://pic.netbian.com'
    records = []
    for index, link in enumerate(picLink):
        try:
            bro.get(link)
            html = pq(bro.page_source, parser='html')
            src = html('.photo-pic #img img').attr('src')
            if src is None:
                print('No image found, skipping:', link)
                continue
            # Ids stay consecutive over the pages that were actually saved.
            record_id = len(records) + 1
            records.append({
                '_id': record_id,    # MongoDB primary key
                'id': record_id,
                'desc': html('.photo-hd h1').text(),
                # Index by the link's position, not by the success counter:
                # otherwise one skipped page shifts every later thumbnail.
                'smallPic': smallPic[index],
                'picLink': base_url + src,
            })
        except Exception as err:
            print('Failed to process, skipping:', link, err)
            continue
        time.sleep(2.5)  # pause between page visits

    # insert_many rejects an empty list, so only write when there is data.
    if records:
        mycol.insert_many(records)
    else:
        print('No records collected; nothing was written to the database.')



def actions(dom):
    """Move the mouse onto ``dom`` and click it using a Selenium action chain.

    Args:
        dom: The web element to hover over and click.
    """
    chain = ActionChains(bro)
    chain.move_to_element(dom)
    chain.click()
    chain.perform()
    # Clear the queued actions once they have been performed.
    chain.reset_actions()


# Script entry point: ask for a search keyword and run the whole pipeline
# (search -> collect links -> resolve image URLs -> save to the database).
name = input('Search Something:')
search_box(name)