最简爬虫入门！附代码

啊啊喔喔喔啊啊

已于 2023-04-29 21:14:33 修改

阅读量475

点赞数

分类专栏： Python 文章标签：爬虫 python 网络爬虫

于 2023-04-29 21:06:58 首次发布

本文链接：https://blog.csdn.net/qq_49050902/article/details/130443580

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

心血来潮，想把以前写的爬虫回顾一下。
环境：
谷歌浏览器、python、pycharm、selenium、xpath插件
最垃圾的程序员，欢迎指导

简单介绍一下：selenium 这个库可以驱动浏览器，模拟人在网页上的点击、输入等操作。
使用前需要安装谷歌浏览器，并下载与浏览器版本对应的 chrome 驱动（ChromeDriver），下载后放在 python 目录里面：
http://chromedriver.storage.googleapis.com/index.html
在这里插入图片描述

XPath 是一种用来定位页面元素的路径表达式，可以理解为元素在页面里的“地址”。
比如，可以进入开发者工具然后选中某个元素，我选中某站搜索框
在这里插入图片描述

在右边的代码位置右键，COPY -> COPY XPATH
我这里复制下来是
//*[@id="i_cecream"]/div/div[2]/div/div/div/div/input
这个就是搜索框在这个页面的“门牌号”

谷歌浏览器可以安装一个xpath helper
方便查看xpath对应的元素在不在，比如这样
在这里插入图片描述

好的，到这里你就可以开始尝试爬虫了
注意：网页的 XPath 会随着页面改版而变化，运行前需要按当前页面的实际情况修改代码里的 XPath。


#先导入包，然后打开网页

from selenium import webdriver
import time
import pandas as pd

# Module-level state shared with get_url().
# These containers live for the whole process, so anything get_url() does not
# clear explicitly is carried over into the next call.
Vieourl=[]  # every video link (href) seen by get_url(); never cleared, and it is what get_url() returns
title=[]  # titles of the videos on the result page currently being processed
urls_pre=[]  # meant to hold individual video links; not referenced by the code visible in this file
dict_link={}  # maps video title -> video link

# The bare string below is a no-op expression statement used as a note. It says
# that page elements are read through webdriver with
# driver.find_element_by_xpath('').get_attribute('textContent').
'''
webdriver通过使用
driver.find_element_by_xpath('').get_attribute('textContent')
来获取页面元素
'''

def get_url(KeyWord: str) -> list:
    """Search bilibili for ``KeyWord`` and collect the matching video links.

    Opens Chrome through Selenium, submits the search, walks every result
    page and keeps the title -> link pairs whose title contains ``KeyWord``.
    The kept pairs are printed and written to
    ``{KeyWord}相关视频链接文件{timestamp}.csv`` in the working directory
    (one header row of titles, one data row of links).

    Args:
        KeyWord: Search term; also used as the substring filter on titles.

    Returns:
        The module-level ``Vieourl`` list, extended with every link seen on
        the visited pages (filtered or not). It is not reset between calls.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the search box
            or the search button cannot be located. All XPaths below are tied
            to the page layout and must be updated when the site changes.
    """
    # Function-scope imports keep this block independent of the file header.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    next_button_xpath = (
        '//*[@class="vui_button vui_pagenation--btn vui_pagenation--btn-side"]'
    )

    chrome_options = webdriver.ChromeOptions()
    # To run without a visible window: chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    try:
        # Open the search page and type the keyword into the search box.
        browser.get("https://search.bilibili.com/?from_source=webtop_search")
        browser.find_element(
            By.XPATH, '//*[@id="i_cecream"]/div/div[2]/div/div/div/div/input'
        ).send_keys(KeyWord)

        # Another element overlaps the button, so a direct click fails;
        # dispatch the click through JavaScript instead.
        search_button = browser.find_element(
            By.XPATH, '//*[@id="i_cecream"]/div/div[2]/div/div/div/div/button'
        )
        browser.execute_script("arguments[0].click();", search_button)
        time.sleep(2)  # wait for the result page to load

        # The pager's position in the DOM varies, so probe div[2]..div[5] for
        # the button holding the last page number. When no pager is found
        # (single page of results) fall back to 1 so that page is still read.
        page_count = 1
        for div_index in range(2, 6):
            total_xpath = (
                '//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div['
                + str(div_index) + ']/div/div/button[9]'
            )
            try:
                raw_count = browser.find_element(
                    By.XPATH, total_xpath
                ).get_attribute('textContent')
            except NoSuchElementException:
                continue
            try:
                page_count = int(raw_count)
            except (TypeError, ValueError):
                # The button exists but its label is not a number; ignore it.
                continue
        print('获取到的页数：{}'.format(page_count))

        # Visit pages 1..page_count inclusive; "next" is clicked between
        # pages only, so the last page is scraped and no click follows it.
        for page in range(1, page_count + 1):
            anchors = browser.find_elements(
                By.XPATH, '//*[@class="video-list row"]//div[1]/a'
            )
            for anchor in anchors:
                video_title = anchor.get_attribute("textContent")
                link = anchor.get_attribute("href")
                Vieourl.append(link)  # every link is kept in the returned list
                # Only titles that contain the keyword go into the CSV mapping.
                if video_title is not None and KeyWord in video_title:
                    dict_link[video_title] = link

            print('--本{}页获取完毕，正在清空准备下一页--，一共{}页'.format(page, page_count))

            if page == page_count:
                break

            # On page 1 the "next" button is the first element matching the
            # class; on later pages it is the second one.
            position = 1 if page == 1 else 2
            try:
                next_button = browser.find_element(
                    By.XPATH, f'{next_button_xpath}[{position}]'
                )
            except NoSuchElementException:
                # Keep what was collected so far instead of crashing.
                print('未找到下一页按钮，提前结束翻页')
                break
            browser.execute_script("arguments[0].click();", next_button)
            time.sleep(9)  # wait for the next page to load

        print('{}相关链接爬取完毕，生成文件'.format(KeyWord))
        print(dict_link)
        # NOTE: index=[0] yields a single row whose columns are the titles;
        # this shape is kept so the output file format does not change.
        dataframe = pd.DataFrame(dict_link, index=[0])

        # Name the file after the current local time.
        new_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        dataframe.to_csv(f"{KeyWord}相关视频链接文件{new_time}.csv", index=False, sep=',', encoding='utf-8')
    finally:
        # Always reset the shared mapping and end the driver session, even
        # when a lookup above raised, so no browser process is left behind.
        dict_link.clear()
        browser.quit()

    print("每个视频的链接爬取完成")
    return Vieourl

def main():
    """Prompt for a search keyword on stdin and run the crawl for it."""
    get_url(input("请输入要搜索的关键字："))


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()