selenium+bs4库爬取京东商城戴尔显示器数据

最新推荐文章于 2019-11-05 16:59:01 发布

moshanghuali

最新推荐文章于 2019-11-05 16:59:01 发布

阅读量323

点赞数

分类专栏：爬虫小项目 文章标签：selenium bs4 python

本文链接：https://blog.csdn.net/moshanghuali/article/details/89278954

版权

爬虫小项目专栏收录该内容

2 篇文章 0 订阅

订阅专栏

def main():
    """Run the scrape end to end for one hard-coded JD.com product page.

    Requests 20 pages of review HTML through ``get_html``, turns them into
    rows with ``get_info``, prints how many rows were collected and passes
    the rows to ``save_info``.
    """
    from spider_re import get_html as html_fetcher
    from spider_re import get_info as review_parser
    from spider_re import save_info as result_writer

    product_url = 'https://item.jd.com/2316993.html#comment'
    pages = html_fetcher.get_html(product_url, 20)
    rows = review_parser.get_info(pages)

    row_count = len(rows)
    print('一共抓取{}条数据'.format(row_count))
    print('数据正在保存中，请稍等。。。。')
    result_writer.save_info(rows)


# Script entry point: run the scrape only when this file is executed
# directly (not when imported), then print a completion message.
if __name__ == '__main__':
    main()
    print('数据已经存储完毕')

def get_html(url, page_num):
    """Collect the rendered HTML of successive review pages of a product.

    Opens ``url`` in headless Chrome, clicks the product-review tab and then
    captures ``driver.page_source`` once per page, clicking the pager's
    "next page" link between captures so that each capture holds a new page
    of reviews.

    Args:
        url: Address of the product page to scrape.
        page_num: Number of review pages to capture.

    Returns:
        A list of ``page_num`` HTML strings, one per capture. If the
        "next page" link cannot be found or clicked, the error is printed
        and the following capture repeats the page currently displayed.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time

    # Headless Chrome driver.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Per the original author: Google's documentation mentions this flag as
    # a workaround for a bug in headless mode.
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    html_list = []
    try:
        driver.get(url)
        time.sleep(2)
        # Switch to the "product reviews" tab.
        driver.find_element_by_xpath('//*[@id=\"detail\"]/div[1]/ul/li[5]').click()
        time.sleep(1)
        for i in range(page_num):
            html_list.append(driver.page_source)
            print('正在获取第{}个界面'.format(i+1))
            if i == page_num - 1:
                # Nothing is captured after the last page, so there is no
                # reason to advance the pager again.
                break
            time.sleep(2)
            try:
                # Advance to the next page of reviews. The element must
                # actually be clicked; merely locating it leaves the page
                # unchanged and every capture would be identical.
                next_link = driver.find_element_by_xpath('//div[@id=\"comment-0\"]/div[13]/div/div/a[@class=\"ui-pager-next\"]')
                next_link.click()
                time.sleep(1)
            except Exception as e:
                # Best effort, as in the original: report and keep going.
                print(e)
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # headless Chrome process is left running.
        driver.quit()
    return html_list

def get_info(html_list):
    """Parse review pages with BeautifulSoup and extract one row per review.

    Args:
        html_list: Iterable of HTML strings, one per captured review page.

    Returns:
        A list of 4-item lists
        ``[comment_time, order_type, comment, appended_comment]``.
        ``appended_comment`` is the literal string ``'None '`` when the
        review does not have exactly two ``p.comment-con`` paragraphs.
        Review items that lack a comment paragraph or have fewer than two
        order-info spans are skipped.
    """
    from bs4 import BeautifulSoup

    total_info = []
    for content in html_list:
        soup = BeautifulSoup(content, 'html.parser')
        # Each page holds the review items as direct children of #comment-0.
        for item in soup.select(r'#comment-0 > div.comment-item'):
            comment_text = item.find_all('p', {'class': 'comment-con'})
            # Looked up once and reused for both fields below.
            order_spans = item.select(r'div.comment-message > div.order-info > span')
            if not comment_text or len(order_spans) < 2:
                # Incomplete item: skip it instead of letting an IndexError
                # abort the whole extraction.
                continue

            # get_text() is used instead of .string because .string is None
            # whenever the paragraph contains child tags (e.g. <br>).
            comment = comment_text[0].get_text()
            # A second paragraph is treated as the follow-up comment.
            if len(comment_text) == 2:
                comment_append = comment_text[1].get_text()
            else:
                comment_append = 'None '

            # Per the original author's comments: the first span is the
            # order type and the second is the review time.
            order_type = order_spans[0].get_text()
            comment_time = order_spans[1].get_text()

            total_info.append([comment_time, order_type, comment, comment_append])

    return total_info

def save_info(total_info, path='E:\\info.xlsx'):
    """Write the scraped review rows to an Excel workbook.

    Args:
        total_info: Rows of four values each, in the column order
            review time, order type, review text, follow-up review.
        path: Destination of the workbook. Defaults to the previously
            hard-coded location, so existing callers behave as before;
            pass another path on systems without an ``E:`` drive.
    """
    import pandas as pd

    columns = ['评论时间', '订单类型', '评论内容', '追加评论']
    frame = pd.DataFrame(total_info, columns=columns)
    frame.to_excel(path)

结果：