Hands-on Project 2: Scraping Domestic News from Huanqiu (环球网) with Regex Parsing

1. Finding the Article Links

(Screenshot: locating the article URL)

  • The address we find in the page is not a complete URL, so open the browser's developer tools and capture the network traffic while refreshing the page.
    (Screenshot: requests captured on refresh)

  • A request with query parameters shows up, and since the page has no pagination buttons, we infer that it lazy-loads content; scroll down the page and watch which request fires for the next batch.

(Screenshots: the captured request after the first and second loads)

  • The offset parameter changes between loads, and the trailing limit parameter suggests each request fetches at most 24 items.
# List-API URL template: offset pages through the feed, limit caps each page at 24

url = f"https://china.huanqiu.com/api/list?node=%22/e3pmh1nnq/e3pmh1obd%22,%22/e3pmh1nnq/e3pn61c2g%22,%22/e3pmh1nnq/e3pn6eiep%22,%22/e3pmh1nnq/e3pra70uk%22,%22/e3pmh1nnq/e5anm31jb%22,%22/e3pmh1nnq/e7tl4e309%22&offset={i}&limit=24"
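Since offset is the only moving part, walking through the feed is just a loop that steps by the page size; a minimal sketch (fetching 4 pages here is an arbitrary choice for illustration):

# Step offset by the page size; limit=24 caps each response at 24 items
for i in range(0, 4 * 24, 24):  # offsets 0, 24, 48, 72
    url = (
        "https://china.huanqiu.com/api/list?"
        "node=%22/e3pmh1nnq/e3pmh1obd%22,%22/e3pmh1nnq/e3pn61c2g%22,"
        "%22/e3pmh1nnq/e3pn6eiep%22,%22/e3pmh1nnq/e3pra70uk%22,"
        "%22/e3pmh1nnq/e5anm31jb%22,%22/e3pmh1nnq/e7tl4e309%22"
        f"&offset={i}&limit=24"
    )
    # fetch and parse each page exactly as in section 2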

2. Parsing the Content

  • Fetch a sample response
import re

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# First page of the feed: offset=0
url = "https://china.huanqiu.com/api/list?node=%22/e3pmh1nnq/e3pmh1obd%22,%22/e3pmh1nnq/e3pn61c2g%22,%22/e3pmh1nnq/e3pn6eiep%22,%22/e3pmh1nnq/e3pra70uk%22,%22/e3pmh1nnq/e5anm31jb%22,%22/e3pmh1nnq/e7tl4e309%22&offset=0&limit=24"
response = requests.get(url=url, headers=headers)
response_text = response.content.decode("utf8")
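Before writing patterns against this payload, it is worth previewing what it actually contains; the `"aid": "..."` pairs that the regexes below target suggest a JSON-style body:

# Sanity check: confirm the request succeeded and eyeball the field layout
print(response.status_code)  # expect 200
print(response_text[:300])   # should show the "aid", "title", ... pairs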
  • Extract the article links
urls = []

url_ = re.findall('"aid": ".*?",', response_text)
for u in url_:
    # pull the id out of the matched pair; ids in this feed start with "4"
    url = "https://china.huanqiu.com/article/" + re.findall("4\\w*", u)[0]
    urls.append(url)
  • Extract the article titles
titles = []

title_ = re.findall('"title": ".*?",', response_text)
for t in title_:
    # group(2) is the value between the quotes
    title = re.search('("title"): "(.*?)"', t).group(2)
    titles.append(title)
  • Extract the article authors
authors = []

author_ = re.findall('"name":".*?"', response_text)
for a in author_:
    author = re.search('("name"):"(.*?)"', a).group(2)
    authors.append(author)
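As a side note, since `re.findall` returns just the captured group when the pattern contains one, each of the three loops above collapses into a single call. An equivalent sketch (the results match the loops above as long as every aid value starts with the "4" the original pattern anchors on):

# Capture groups make findall return the field values directly
aids = re.findall('"aid": "(.*?)"', response_text)
titles = re.findall('"title": "(.*?)"', response_text)
authors = re.findall('"name":"(.*?)"', response_text)
urls = ["https://china.huanqiu.com/article/" + aid for aid in aids]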
  • Extract the article bodies
articles = []

for article_url in urls:
    article = requests.get(url=article_url, headers=headers)
    article_response = article.content.decode("utf8")
    # match the article body; `.` does not cross newlines, so this assumes
    # the whole <section data-type="rtext"> body sits on a single line
    article_text1 = re.findall('<section data-type="rtext"><p>.*</p>', article_response)
    # strip the HTML tags from each match
    article_text2 = [re.sub(r'<[^>]+>', '', content) for content in article_text1]
    # keep the first (and normally only) match for this article
    if article_text2:
        articles.append(article_text2[0])
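The markup removal above is carried by `re.sub(r'<[^>]+>', '', ...)`, which deletes every run of characters between `<` and `>`. A tiny standalone example of the same substitution:

import re

sample = '<section data-type="rtext"><p>第一段。</p><p>第二段。</p></section>'
print(re.sub(r'<[^>]+>', '', sample))  # prints: 第一段。第二段。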

3. Saving the Scraped Content

# Pair up the extracted fields: one (title, author, body) tuple per article
info_list = list(zip(titles, authors, articles))

with open("前24条新闻数据.txt", "w", encoding="utf-8") as f:  # "first 24 news items"
    # for i in range(0, len(info_str), 80):  # alternatively, wrap long text every 80 characters
    #     f.write(info_str[i: i + 40] + "\n")
    for i in info_list:  # one record per line
        f.write(str(i) + "\n")
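If you would rather keep the fields in separate columns, the same data maps naturally onto CSV via the standard library; a minimal sketch (the output filename is an arbitrary choice):

import csv

with open("news_first_24.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "author", "article"])  # header row
    writer.writerows(zip(titles, authors, articles))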

Code: https://gitee.com/justinc666/crawler/tree/master/实战
