爬取网页页面信息技术交流

最新推荐文章于 2024-10-08 12:37:10 发布

a_blue_fat

最新推荐文章于 2024-10-08 12:37:10 发布

阅读量341

点赞数 4

文章标签： python

本文链接：https://blog.csdn.net/2301_80120329/article/details/140258962

版权

完整代码


# 文件名称 爬取武汉大学
from urllib.parse import urljoin

import requests
from lxml import html

from module.write_and_read import dump_excel

# Module-level accumulator: main() appends the scraped entries to this list
# and the script entry point passes it to dump_excel.
head_list = []


def _join_text(fragments):
    """Collapse the text fragments of one XPath result into a single cell."""
    return "".join(fragments).strip()


def main(url, page, num):
    """Scrape one news-list page and append its entries to ``head_list``.

    Every ``div.txt`` item under ``ul.p-list10`` becomes exactly one row of
    the form ``[title, link, date]``, so each row lines up with a
    three-column header. A field that is missing from the page is stored as
    an empty string instead of aborting the whole run.

    Args:
        url: Address of the list page to download.
        page: Not used by this function; kept so existing callers keep working.
        num: Number of rows collected before this call.

    Returns:
        ``num`` plus the number of rows appended by this call.
    """
    # NOTE(review): placeholder text - replace it with your own browser's
    # User-Agent string before running; presumably the HTTP layer cannot
    # send non-Latin-1 header values - confirm.
    headers = {'User-Agent': '自己电脑的用户代理'}

    # A timeout keeps one stalled connection from blocking the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)

    # Parse the downloaded markup into an element tree.
    text = html.etree.HTML(resp.text)
    if text is None:
        # Nothing parseable came back, so there are no entries to collect.
        return num

    # Locate the container element of every news entry.
    headline_list = text.xpath('//ul[@class="p-list10"]/li/div[@class="txt"]')
    print(headline_list)
    print(len(headline_list))

    rows = []
    for headline in headline_list:
        title = _join_text(
            headline.xpath('./h4[@class="tit"]/a[@href]/text()'))
        hrefs = headline.xpath(
            './h4[@class="tit"]/a[@target="_blank"]/@href')
        # urljoin copes with absolute hrefs and hrefs without a leading
        # slash, both of which plain string concatenation would mangle.
        link = urljoin("http://journal.whu.edu.cn", hrefs[0]) if hrefs else ""
        date = _join_text(headline.xpath('./div[@class="date"]/text()'))
        rows.append([title, link, date])

    # Appending finished rows avoids indexing head_list by num, which only
    # worked while num happened to equal len(head_list).
    head_list.extend(rows)

    print(head_list)
    return num + len(rows)


if __name__ == '__main__':
    # Running row count; main() hands back the updated value after each page.
    num = 0

    # Forwarded to main() unchanged on every call.
    page = 0

    # Visit list pages 1 through 9.
    for page_index in range(1, 10):
        num = main(
            f"http://journal.whu.edu.cn/news/index/page/{page_index}",
            page,
            num,
        )

    # Column titles for the output table.
    headers = ['标题', '网址', '时间']

    # Hand the column titles, the collected rows and the name "test1" to
    # dump_excel (per the original comment, this saves an Excel sheet).
    dump_excel(headers, head_list, "test1")