selenium爬取搜狗网站新闻内容的小Demo

Tian丶Yuting

于 2024-08-15 15:03:26 发布

阅读量187

点赞数 2

文章标签： selenium python 爬虫

本文链接：https://blog.csdn.net/weixin_45707730/article/details/141222658

版权

selenium Version: 3.141.0
lxml Version: 4.6.2

from lxml.html import etree
from selenium import webdriver
import pandas as pd

# Placeholder request headers: replace 'USER-AGENT' with your own browser's
# User-Agent string.
headers = {'User-Agent': 'USER-AGENT'}


def _is_subtitle_style(style):
    """Return True when an inline ``style`` value uses the sub-heading colour."""
    # The article writes the same colour in several ways (rgb() vs. hex, with
    # or without spaces), so compare against a lowercase, whitespace-free form.
    normalized = ''.join((style or '').lower().split())
    return 'color:rgb(0,176,240)' in normalized or 'color:#00b0f0' in normalized


def sougou_requests(url):
    """Load an article page and export its paragraphs to ``./新闻.xlsx``.

    The page is opened with the module-level ``browser`` WebDriver, which must
    already exist when this function is called. The direct children of the
    article container are then walked in document order:

    * a ``<section>`` child sets the current main heading;
    * a ``<p>`` child whose first direct ``<span>`` carries the sub-heading
      colour sets the current sub-heading;
    * any other non-blank ``<p>`` child becomes one output row containing the
      current main heading, the current sub-heading and the paragraph text.

    Args:
        url: Address of the article page to load.

    Raises:
        IndexError: If the hard-coded layout XPath does not match the page.
    """
    browser.get(url)
    tree = etree.HTML(browser.page_source)

    # Absolute path to the article body; tied to the layout of the target page.
    containers = tree.xpath(
        '/html/body/div[2]/div[2]/div[2]/div/div[1]/div[2]/section[3]/section'
    )
    if not containers:
        raise IndexError(
            'article content container not found; the page layout may have changed'
        )

    # The most recently seen headings apply to every paragraph that follows
    # them until they are overwritten by the next heading.
    big_title, small_title = '', ''
    table_list = []
    for box in containers[0]:
        if box.tag == 'section':
            # string() concatenates all descendant text without the markup.
            big_title = box.xpath('string()')
        elif box.tag == 'p':
            text = box.xpath('string()')
            if not text.strip():
                # Spacer paragraphs (e.g. a lone <br/>) carry no content.
                continue
            # A paragraph without a direct <span>, or a span without a style
            # attribute, is treated as ordinary content instead of crashing.
            spans = box.xpath('./span')
            style = spans[0].get('style') if spans else None
            if _is_subtitle_style(style):
                small_title = text
            else:
                table_list.append(
                    {'big_title': big_title, 'small_title': small_title, 'content': text}
                )

    # Export the collected rows to an Excel file with pandas.
    pd.DataFrame(table_list).to_excel('./新闻.xlsx')


if __name__ == '__main__':
    # Article to scrape; the address is split across two literals for length.
    first_url = 'https://mp.weixin.qq.com/s?src=11&timestamp=1723687078&ver=5445&signature=PK5iGUl7HUh0PaTKxqKO1SIKOze6OEsK2qeQi8xCCWCR6tCCkK6*' \
                '6Eymz2j6w2BkpnRoLeWfdU-f3T6f5X2BaHd8QgT1HHe0Pja-En804EJwAr8AqD4ASraXQ90l5Vy7&new=1'
    # sougou_requests() reads this module-level name, so it must be bound
    # before the call.
    browser = webdriver.Chrome()
    try:
        sougou_requests(first_url)
    finally:
        # quit() ends the whole WebDriver session (browser and driver process),
        # and running it in `finally` releases them even if scraping fails.
        browser.quit()