day24 - Web Scraping, Part 4 (Selenium)

Summary

  • # Basic Selenium usage
    
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    
    # 1. Create a browser object
    # Note: if the browser object is a local variable, the browser closes automatically once the
    # operations finish; if it is a global variable, the browser has to be closed manually
    # web = webdriver.Chrome()
    
    # 2. Open a web page
    # web.get('https://www.jd.com')
    # web.close()       # closes the current window (use web.quit() to shut the whole browser down)
    
    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Maximize the window
        # web.maximize_window()
    
    # 3. Get the page data
    # Note: page_source only contains what the browser has already loaded at that moment
        print(web.page_source)
    
    # 4. Find and operate on elements
    # 1) Input box workflow: find the input  ->  type the text  ->  press Enter
        # Find the input box
        input = web.find_element_by_id('key')
        # Type the search text
        input.send_keys('笔记本电脑')
        # Press the Enter key
        input.send_keys(Keys.ENTER)
    
        time.sleep(0.1)
    
        # Find the input box again
        input2 = web.find_element_by_id('key')
        # Clear the text left in the input box
        input2.clear()
        # input2.send_keys(Keys.BACKSPACE*10)
        # Type the new search text
        input2.send_keys('86键盘')
        # Click the search button instead of pressing Enter
        search = web.find_element_by_css_selector('#search > div > div.form > button')
        search.click()
        # input2.send_keys(Keys.ENTER)
    
        time.sleep(0.1)
    
    # 5. Go back and forward in history
        web.back()
        time.sleep(0.1)
        web.forward()
    
        time.sleep(60)
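
    The find_element_by_* helpers used above were removed in Selenium 4. A minimal
    sketch of the equivalent By-based locators (same JD home page, same 'key' input id):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Selenium 4 spelling of web.find_element_by_id('key')
        box = web.find_element(By.ID, 'key')
        # send_keys accepts several arguments: type the text, then press Enter
        box.send_keys('笔记本电脑', Keys.ENTER)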
    
  • # Selenium tabs
    from selenium import webdriver
    import time
    from selenium.webdriver.common.keys import Keys
    with webdriver.Chrome() as web:
    # web = webdriver.Chrome()
        web.get('https://www.jd.com')
    
    # Find the hyperlink for the '秒杀' (flash sale) entry
        miaosha = web.find_element_by_xpath('//*[@id="navitems-group1"]/li[1]/a')
        miaosha.click()
    
        time.sleep(3)
        web.set_page_load_timeout(10)
        # The click opened the flash-sale page in a new tab; switch_to.window attaches
        # the driver to the tab with the given handle, and window_handles[0] is the
        # first (original) tab
        web.switch_to.window(web.window_handles[0])
    
    
        time.sleep(100)
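
    window_handles lists every open tab in opening order, and the driver stays on
    whichever tab it was last switched to. A minimal standalone sketch of jumping to
    the newest tab, reading it, and returning (assumes some click opened a second tab):

    from selenium import webdriver
    import time

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # ... suppose a click here opened a second tab ...
        time.sleep(3)
        if len(web.window_handles) > 1:
            # Jump to the most recently opened tab
            web.switch_to.window(web.window_handles[-1])
            print(web.title)    # now reads the new tab
            web.close()         # closes only the current tab, not the browser
            # Return to the original (first) tab
            web.switch_to.window(web.window_handles[0])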
    
  • # Selenium page scrolling
    
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from bs4 import BeautifulSoup
    from lxml import etree
    import re
    import time
    import csv
    
    
    # Page scrolling
    def page_generator():
        web = webdriver.Chrome()
        web.get('https://www.jd.com')
        # web.get('https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&pvid=48eda17852a64eba9eff17ee98858810&page=75&s=2221&click=0')
        web.maximize_window()
    
        # Search for '笔记本电脑' (laptops) from the home page
        input = web.find_element_by_id('key')
        input.send_keys('笔记本电脑')
        input.send_keys(Keys.ENTER)
    
        # web.set_page_load_timeout(10)
        old_url = web.current_url
        while True:
            time.sleep(0.1)
            # Scroll down in 200px steps so the lazily loaded goods render
            max_height = 10000
            position = 200
            while True:
                web.execute_script(f'window.scrollTo(0, {position})')
                position += 200
                if position > max_height:
                    break
                time.sleep(0.1)
            # Hand the fully loaded page source back to the caller
            yield web.page_source
            # On JD's result page, pressing the Right-arrow key on the body jumps to the next page
            body = web.find_element_by_tag_name('body')
            body.send_keys(Keys.RIGHT)
            # next_btn = web.find_element_by_css_selector('#J_bottomPage > span.p-num > a.pn-next')
            # next_btn.click()
            time.sleep(1)
            # If the URL did not change, we were already on the last page
            if old_url == web.current_url:
                break
            old_url = web.current_url
        web.close()
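
    # The hard-coded max_height above happens to cover JD's result page. A more general
    # pattern (a sketch; scroll_to_bottom is a hypothetical helper, not part of the
    # original script) re-reads document.body.scrollHeight as it goes, so it also works
    # when lazy loading keeps growing the page:
    def scroll_to_bottom(web, step=400, pause=0.1):
        position = 0
        while True:
            # Current total page height (grows as lazy content loads)
            height = web.execute_script('return document.body.scrollHeight')
            if position >= height:
                break
            position += step
            web.execute_script(f'window.scrollTo(0, {position})')
            time.sleep(pause)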
    
    
    # with open(r'./temp.html', 'w', encoding='utf-8') as f:
    #     f.write(html)
    #
    # with open(r'./temp.html', 'r', encoding='utf-8') as f:
    #     html = f.read()
    
    # soup = BeautifulSoup(html, 'lxml')
    # li_s = soup.select('#J_goodsList > ul > li')
    
    # for li in li_s:
    #     title = li.select_one('div > div.p-name.p-name-type-2 > a > em').get_text()
    #     detail = li.select_one('div > div.p-img > a').attrs['href']
    #     img = li.select_one('div > div.p-img > a > img').attrs['src']
    #     price = li.select_one('div > div.p-price').get_text()
    #     comment = li.select_one('div > div.p-commit > strong').get_text()
    #     # comment_url = li.select_one('div > div.p-commit > strong > a').atrrs['href']
    #     business = li.select_one('div > div.p-shop > span > a').get_text()
    #     print(title, detail, img, price, comment, business)
    
    def analysis_data(content: str, all_list: list):
        """Parse one page of JD search results and append one row per item to all_list."""
        html = etree.HTML(content)
        li_s = html.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/div/div[2]/ul/li')
        https = 'https:'
        for li in li_s:
            title = li.xpath('./div/div[3]/a//*/text()')
            detail = https + li.xpath('./div/div[1]/a/@href')[0]
            img = li.xpath('./div/div[1]/a/img/@src')
            img = https + img[0] if img else ''
            price = li.xpath('./div/div[2]/strong/i/text()')
            price = price[0] if price else ''
            comment_num = li.xpath('./div/div[4]/strong/a/text()')
            comment_num = comment_num[0] if comment_num else ''
            comment_url = li.xpath('./div/div[4]/strong/a/@href')
            comment_url = https + comment_url[0] if comment_url else ''
            business = li.xpath('./div/div[5]/span/a/text()')
            # xpath returns a list; take the first shop name (or '' when missing)
            business = business[0] if business else ''
            tags = li.xpath('./div/div[6]/i/text()')
            tags = tags if tags else ''
            # Collapse whitespace inside the title fragments and join them with '|'
            one_list = ['|'.join([re.sub(r'\s+', '', x) for x in title]), detail, img, price, comment_num, comment_url,
                        business, '|'.join(tags).strip()]
            all_list.append(one_list)
            # print('|'.join([re.sub(r'\s+', '', x) for x in title]), detail, img, price, comment_num, comment_url, business, '|'.join(tags).strip())
    
    
    if __name__ == '__main__':
        data_list = []
        page = page_generator()
    
        with open(r'./jd.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'detail', 'img', 'price', 'comment_num', 'comment_url', 'business', 'tags'])
    
        with open(r'./jd.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            while True:
                try:
                    analysis_data(next(page), data_list)
                except StopIteration:
                    # page_generator is exhausted: the last page has been parsed
                    break
                finally:
                    # Flush the rows collected for this page before the next iteration
                    writer.writerows(data_list)
                    data_list.clear()
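
    WebDriverWait is imported at the top of this script but never used; the fixed
    time.sleep calls break when the network is slow. A minimal sketch of an explicit
    wait (assumes Selenium 4 style By locators and the same 'key' input id on JD's
    home page):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    with webdriver.Chrome() as web:
        web.get('https://www.jd.com')
        # Block up to 10 s until the search box exists, instead of sleeping blindly
        box = WebDriverWait(web, 10).until(
            EC.presence_of_element_located((By.ID, 'key'))
        )
        box.send_keys('笔记本电脑')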
    
    