day24 - Web Scraping, Part 4 (Selenium)

Summary

  • # Basic Selenium usage
    
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    
    # 1. Create a browser object
    # Note: if the browser object is a local variable, the browser closes automatically once the
    # operations finish; if it is a global variable, the browser has to be closed manually
    # web = webdriver.Chrome()
    
    # 2. Open a web page
    # web.get('https://www.jd.com')
    # web.close()       # closes the current window (use web.quit() to shut the whole browser down)
    
    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Maximize the window
        # web.maximize_window()
    
    # 3. Get the page data
    # Note: page_source only contains what the browser has already loaded at that moment
        print(web.page_source)
    
    # 4. Find and operate on elements
    # 1) Input box workflow: find the input  ->  type the text  ->  press Enter
        # Find the input box
        input = web.find_element_by_id('key')
        # Type the search text
        input.send_keys('笔记本电脑')
        # Press the Enter key
        input.send_keys(Keys.ENTER)
    
        time.sleep(0.1)
    
        # Find the input box again
        input2 = web.find_element_by_id('key')
        # Clear the text left in the input box
        input2.clear()
        # input2.send_keys(Keys.BACKSPACE*10)
        # Type the new search text
        input2.send_keys('86键盘')
        # Click the search button instead of pressing Enter
        search = web.find_element_by_css_selector('#search > div > div.form > button')
        search.click()
        # input2.send_keys(Keys.ENTER)
    
        time.sleep(0.1)
    
    # 5. Go back and forward in history
        web.back()
        time.sleep(0.1)
        web.forward()
    
        time.sleep(60)
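
    The find_element_by_* helpers used above were removed in Selenium 4. A minimal
    sketch of the equivalent By-based locators (same JD home page, same 'key' input id):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Selenium 4 spelling of web.find_element_by_id('key')
        box = web.find_element(By.ID, 'key')
        # send_keys accepts several arguments: type the text, then press Enter
        box.send_keys('笔记本电脑', Keys.ENTER)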
    
  • # Selenium tabs
    from selenium import webdriver
    import time
    from selenium.webdriver.common.keys import Keys
    with webdriver.Chrome() as web:
    # web = webdriver.Chrome()
        web.get('https://www.jd.com')
    
    # Find the hyperlink for the '秒杀' (flash sale) entry
        miaosha = web.find_element_by_xpath('//*[@id="navitems-group1"]/li[1]/a')
        miaosha.click()
    
        time.sleep(3)
        web.set_page_load_timeout(10)
        # The click opened the flash-sale page in a new tab; switch_to.window attaches
        # the driver to the tab with the given handle, and window_handles[0] is the
        # first (original) tab
        web.switch_to.window(web.window_handles[0])
    
    
        time.sleep(100)
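
    window_handles lists every open tab in opening order, and the driver stays on
    whichever tab it was last switched to. A minimal standalone sketch of jumping to
    the newest tab, reading it, and returning (assumes some click opened a second tab):

    from selenium import webdriver
    import time

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # ... suppose a click here opened a second tab ...
        time.sleep(3)
        if len(web.window_handles) > 1:
            # Jump to the most recently opened tab
            web.switch_to.window(web.window_handles[-1])
            print(web.title)    # now reads the new tab
            web.close()         # closes only the current tab, not the browser
            # Return to the original (first) tab
            web.switch_to.window(web.window_handles[0])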
    
  • # Selenium page scrolling
    
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from bs4 import BeautifulSoup
    from lxml import etree
    import re
    import time
    import csv
    
    
    # Page scrolling
    def page_generator():
        web = webdriver.Chrome()
        web.get('https://www.jd.com')
        # web.get('https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&pvid=48eda17852a64eba9eff17ee98858810&page=75&s=2221&click=0')
        web.maximize_window()
    
        # Search for '笔记本电脑' (laptops) from the home page
        input = web.find_element_by_id('key')
        input.send_keys('笔记本电脑')
        input.send_keys(Keys.ENTER)
    
        # web.set_page_load_timeout(10)
        old_url = web.current_url
        while True:
            time.sleep(0.1)
            # Scroll down in 200px steps so the lazily loaded goods render
            max_height = 10000
            position = 200
            while True:
                web.execute_script(f'window.scrollTo(0, {position})')
                position += 200
                if position > max_height:
                    break
                time.sleep(0.1)
            # Hand the fully loaded page source back to the caller
            yield web.page_source
            # On JD's result page, pressing the Right-arrow key on the body jumps to the next page
            body = web.find_element_by_tag_name('body')
            body.send_keys(Keys.RIGHT)
            # next_btn = web.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            # next_btn.click()
            time.sleep(1)
            # If the URL did not change, we were already on the last page
            if old_url == web.current_url:
                break
            old_url = web.current_url
        web.close()
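
    # The hard-coded max_height above happens to cover JD's result page. A more general
    # pattern (a sketch; scroll_to_bottom is a hypothetical helper, not part of the
    # original script) re-reads document.body.scrollHeight as it goes, so it also works
    # when lazy loading keeps growing the page:
    def scroll_to_bottom(web, step=400, pause=0.1):
        position = 0
        while True:
            # Current total page height (grows as lazy content loads)
            height = web.execute_script('return document.body.scrollHeight')
            if position >= height:
                break
            position += step
            web.execute_script(f'window.scrollTo(0, {position})')
            time.sleep(pause)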
    
    
    # with open(r'./temp.html', 'w', encoding='utf-8') as f:
    #     f.write(html)
    #
    # with open(r'./temp.html', 'r', encoding='utf-8') as f:
    #     html = f.read()
    
    # soup = BeautifulSoup(html, 'lxml')
    # li_s = soup.select('#J_goodsList > ul > li')
    
    # for li in li_s:
    #     title = li.select_one('div > div.p-name.p-name-type-2 > a > em').get_text()
    #     detail = li.select_one('div > div.p-img > a').attrs['href']
    #     img = li.select_one('div > div.p-img > a > img').attrs['src']
    #     price = li.select_one('div > div.p-price').get_text()
    #     comment = li.select_one('div > div.p-commit > strong').get_text()
    #     # comment_url = li.select_one('div > div.p-commit > strong > a').atrrs['href']
    #     business = li.select_one('div > div.p-shop > span > a').get_text()
    #     print(title, detail, img, price, comment, business)
    
    def analysis_data(content: str, all_list: list):
        """Parse one page of JD search results and append one row per item to all_list."""
        html = etree.HTML(content)
        li_s = html.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/div/div[2]/ul/li')
        https = 'https:'
        for li in li_s:
            title = li.xpath('./div/div[3]/a//*/text()')
            detail = https + li.xpath('./div/div[1]/a/@href')[0]
            img = li.xpath('./div/div[1]/a/img/@src')
            img = https + img[0] if img else ''
            price = li.xpath('./div/div[2]/strong/i/text()')
            price = price[0] if price else ''
            comment_num = li.xpath('./div/div[4]/strong/a/text()')
            comment_num = comment_num[0] if comment_num else ''
            comment_url = li.xpath('./div/div[4]/strong/a/@href')
            comment_url = https + comment_url[0] if comment_url else ''
            business = li.xpath('./div/div[5]/span/a/text()')
            # xpath returns a list; take the first shop name (or '' when missing)
            business = business[0] if business else ''
            tags = li.xpath('./div/div[6]/i/text()')
            tags = tags if tags else ''
            # Collapse whitespace inside the title fragments and join them with '|'
            one_list = ['|'.join([re.sub(r'\s+', '', x) for x in title]), detail, img, price, comment_num, comment_url,
                        business, '|'.join(tags).strip()]
            all_list.append(one_list)
            # print('|'.join([re.sub(r'\s+', '', x) for x in title]), detail, img, price, comment_num, comment_url, business, '|'.join(tags).strip())
    
    
    if __name__ == '__main__':
        data_list = []
        page = page_generator()
    
        with open(r'./jd.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'detail', 'img', 'price', 'comment_num', 'comment_url', 'business', 'tags'])
    
        with open(r'./jd.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            while True:
                try:
                    analysis_data(next(page), data_list)
                except StopIteration:
                    # page_generator is exhausted: the last page has been parsed
                    break
                finally:
                    # Flush the rows collected for this page before the next iteration
                    writer.writerows(data_list)
                    data_list.clear()
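
    WebDriverWait is imported at the top of this script but never used; the fixed
    time.sleep calls break when the network is slow. A minimal sketch of an explicit
    wait (assumes Selenium 4 style By locators and the same 'key' input id on JD's
    home page):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Block up to 10 s until the search box exists, instead of sleeping blindly
        box = WebDriverWait(web, 10).until(
            EC.presence_of_element_located((By.ID, 'key'))
        )
        box.send_keys('笔记本电脑')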
    
    