Pyf20230331(Python selenium)

TK1942

已于 2023-03-31 21:10:08 修改

阅读量156

点赞数

文章标签： python selenium 爬虫

于 2023-03-31 21:07:18 首次发布

本文链接：https://blog.csdn.net/TK1942/article/details/129886945

版权

01 selenium基础

from selenium.webdriver import Chrome

1.创建浏览器对象

b = Chrome()

2.打开网页（需要爬哪个页面的数据，就打开哪个页面对应的网页地址）

b.get('https://movie.douban.com/top250?start=0&filter=')

3.获取网页源代码（注意:不管以什么样的方式更新了界面内容，page_source的内容也会更新）

# Print the HTML source that the browser object currently holds for the page.
print(b.page_source)

# Visual separator between the page dump and the prompt below.
print('----------------------------------------')
# Wait for console input before the script ends.
input('按任意键结束：')

02 selenium翻页

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

selenium获取多页数据翻页的方法

1.方法1：找到不同页地址的变化规律利用循环实现多页数据请求

# b = Chrome()
# for p in range(0, 76, 25):
#     b.get(f'https://movie.douban.com/top250?start={p}&filter=')
#     print(b.page_source)
#     print(f'====================第{p}页====================')
#
# input('输入任意键结束：')

### 2.方法2：点击翻页按钮，刷新页面内容，在刷新后获取网页源代码

# Start a Chrome session on the first page of the Douban Top 250 list.
b = Chrome()
b.get('https://movie.douban.com/top250?start=0&filter=')

page_turns = 0
while page_turns < 5:
    # Dump the source of the page currently shown in the browser.
    print(b.page_source)
    # Find the page-turn button by its class name and click it so the
    # browser moves on to the following page.
    b.find_element(By.CLASS_NAME, 'next').click()
    page_turns += 1

3.selenium获取标签

先导入By：from selenium.webdriver.common.by import By
浏览器对象.find_element(获取方式) - 返回符合条件的第一个标签，结果是标签对象
浏览器对象.find_elements(获取方式) - 返回符合条件的所有标签，结果是列表，列表中的元素是标签对象

以下举例：
browser.find_element(By.ID,'kw')
browser.find_element(By.NAME,'wd')
browser.find_element(By.CLASS_NAME,'s_ipt')
browser.find_element(By.TAG_NAME,'input')
browser.find_element(By.LINK_TEXT,'新闻') - 通过a标签标签内容
browser.find_element(By.PARTIAL_LINK_TEXT,'闻') - 通过a标签部分标签内容
browser.find_element(By.XPATH,'//*[@id="kw"]') - 通过xpath路径
browser.find_element(By.CSS_SELECTOR,'#kw') - 通过css选择器

4.操作标签

a.输入框输入内容：输入框对应的标签.send_keys(内容)
b.点击标签：标签对象.click()

03 selenium操作(中国知网)

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time

1.创建浏览器

b = Chrome()

2.打开中国知网

b.get('https://www.cnki.net/')

3.获取输入框，输入"数据分析"

# Type the query into the search box; the trailing '\n' is sent as Enter.
b.find_element(By.ID, 'txt_SearchText').send_keys('数据分析\n')
time.sleep(2)

# Collect the title elements of all papers in the search results.
titles = b.find_elements(By.CLASS_NAME, 'fz14')

# Click the first search result.
titles[0].click()
time.sleep(2)

# Switch to the most recently opened tab so the browser object points at
# the detail page.
handles = b.window_handles
b.switch_to.window(handles[-1])
rowtit = b.find_element(By.ID, 'ChDivSummary')
# Print the element's text; printing the element itself only shows the
# WebElement object representation.
print(rowtit.text)
# Close the detail tab.
b.close()

# Switch back to the original tab and open the second search result.
b.switch_to.window(handles[0])
titles[1].click()
time.sleep(2)
# The driver still points at the results tab here, so switch to the newly
# opened tab first; otherwise close() would close the results tab instead.
handles = b.window_handles
b.switch_to.window(handles[-1])
b.close()
# Leave the driver attached to a window that is still open.
b.switch_to.window(handles[0])

input('输入任意键结束：')
```



## 04 selenium练习(中国知网)
```python
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.action_chains import ActionChains


def get_net_data(website, p):
    """Search for "数据分析" and print the abstract of every result on ``p`` pages.

    Opens ``website`` in a new Chrome session, submits the query through the
    input with id ``txt_SearchText``, then for each result page clicks every
    title (class ``fz14``), reads the text of the element with id
    ``ChDivSummary`` in the tab that opens, prints it, and closes that tab.

    Args:
        website: URL of the page that contains the search box.
        p: Number of result pages to process.
    """
    # 1. Create the browser.
    b = Chrome()

    # 2. Open the target site.
    b.get(website)

    # 3. Type the query into the search box; the trailing '\n' is sent as Enter.
    b.find_element(By.ID, 'txt_SearchText').send_keys('数据分析\n')
    time.sleep(3)

    for page_no in range(1, p + 1):
        # Title elements of all papers listed on the current result page.
        titles = b.find_elements(By.CLASS_NAME, 'fz14')

        for title in titles:
            title.click()
            time.sleep(2)
            # Point the driver at the most recently opened tab (the detail page).
            b.switch_to.window(b.window_handles[-1])
            rowtit = b.find_element(By.ID, 'ChDivSummary').text
            time.sleep(2)
            print(rowtit)
            # Close the detail tab and return to the first (results) tab.
            b.close()
            b.switch_to.window(b.window_handles[0])

        print(f'==========第{page_no}页完成==========')

        # Only move on when another page is still wanted; clicking after the
        # last requested page is wasted work and fails if the control is gone.
        # (Alternative way to click: locate the link with text '下一页' and
        # use ActionChains(b).click(element).perform().)
        if page_no < p:
            b.find_element(By.ID, 'PageNext').click()
            time.sleep(5)

    input('输入任意键结束：')


# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
    # Site passed to get_net_data as `website`.
    url = 'https://www.cnki.net/'
    # Passed to get_net_data as its second argument `p`.
    page = 3
    get_net_data(url, page)

05 selenium滚动操作

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time

# Start Chrome and open a JD search-results URL (the keyword is URL-encoded
# in the query string).
b = Chrome()
b.get('https://search.jd.com/Search?keyword=%E7%94%B5%E9%A5%AD%E9%94%85&enc=utf-8&wq=%E7%94%B5%E9%A5%AD%E9%94%85&pvid=20d97125d00a409fb95d2735aeb0a7c6')
# Pause two seconds before touching the page (presumably to let it finish loading).
time.sleep(2)

1.用代码控制浏览器滚动

js中页面滚动的代码：window.scrollBy(x方向的偏移量, y方向的偏移量)，在selenium中通过 浏览器对象.execute_script(js代码) 来执行

# Scroll the window down by 2000 pixels four times, pausing one second after
# each step (presumably so content that loads on scroll has time to appear).
scroll_step = 'window.scrollBy(0, 2000)'
for _ in range(4):
    b.execute_script(scroll_step)
    time.sleep(1)

# Count the product entries found inside the goods list container.
result = b.find_elements(By.CSS_SELECTOR, '#J_goodsList .gl-item')
print(len(result))

input('按任意键结束：')

06 作业(爬京东电饭锅)

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from csv import writer
import time


def save_net_data(items):
    """Write the scraped rows to ``files/JD.csv``, replacing any existing file.

    A header row is written first, followed by every row in ``items``.

    Args:
        items: Iterable of rows; each row is a sequence of values in the
            same order as the header columns.
    """
    import os

    # open() does not create missing directories, so make sure the output
    # folder exists before writing.
    os.makedirs('files', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open('files/JD.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        w1.writerow(['商品信息', '价格', '评价', '店铺名称', '商品详情页地址', '店铺地址'])
        w1.writerows(items)


def get_net_data(url, search_key, page):
    """Search ``url`` for ``search_key`` and save product data from ``page`` result pages.

    For every product entry (class ``gl-item``) the name, price, comment
    count, shop name, product link and shop link are collected. After each
    page the accumulated rows are handed to ``save_net_data``, which rewrites
    the whole CSV, so the file always holds everything gathered so far.

    Args:
        url: Address of the page that contains the search box (element id ``key``).
        search_key: Text to type into the search box.
        page: Number of result pages to scrape.
    """
    from selenium.common.exceptions import NoSuchElementException

    b = Chrome()
    b.get(url)

    # Type the keyword into the search box; the '\n' is sent as Enter.
    b.find_element(By.ID, 'key').send_keys(search_key, '\n')
    time.sleep(3)

    items = []

    for p in range(page):
        # Scroll down in steps so the whole list page gets loaded.
        for _ in range(3):
            b.execute_script('window.scrollBy(0, 2200)')
            time.sleep(1)

        # One element per product on the current page.
        p_names = b.find_elements(By.CLASS_NAME, 'gl-item')

        for x in p_names:
            # The product name may span several lines; join them into one.
            p_name = x.find_element(By.CLASS_NAME, 'p-name').text.replace('\n', '')
            p_price = x.find_element(By.CSS_SELECTOR, '.p-price>strong>i').text
            p_commit = x.find_element(By.CSS_SELECTOR, '.p-commit>strong').text
            item_href = x.find_element(By.CSS_SELECTOR, '.p-name>a').get_attribute('href')

            # Some entries have no shop link. Look it up once and fall back
            # to placeholders only when the element is really missing, so
            # other errors are no longer silently swallowed.
            try:
                shop_link = x.find_element(By.CSS_SELECTOR, '.p-shop>span>a')
            except NoSuchElementException:
                p_shop = '店铺名为空'
                shop_href = '店铺链接空'
            else:
                p_shop = shop_link.text
                shop_href = shop_link.get_attribute('href')

            # One row per product, appended to the overall result list.
            items.append([p_name, p_price, p_commit, p_shop, item_href, shop_href])

        # Persist before navigating so this page's rows survive a failed
        # page turn.
        save_net_data(items)

        print(f'==========第{p + 1}页完成==========')

        # Turn the page only when another page is still wanted.
        if p + 1 < page:
            b.find_element(By.CLASS_NAME, 'pn-next').click()
            time.sleep(5)

    input('按任意键结束：')


# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
    # Page opened first; get_net_data types the keyword into its search box.
    url = 'https://www.jd.com/'
    # Keyword to search for.
    search_key = '电饭锅'
    # Number of result pages to scrape.
    page = 3
    get_net_data(url, search_key, page)
```

获取数据完成截图：
![在这里插入图片描述](https://img-blog.csdnimg.cn/aee1f6a94cda41938146a53ffeb6f027.jpeg#pic_center)