Python学习笔记（6）——5家公司新浪新闻挖掘练习-CSDN博客

本文链接：https://blog.csdn.net/Ama_tor/article/details/122767320

代码练习：

import requests #引用requests库
import re#引用re库
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}

#{思路4、定义函数}
def sina(company):
    """Search Sina news for *company* and print every hit found in the page.

    For each result the function prints ``"<n>.<title>-<date>"`` followed by
    the article URL on the next line. Nothing is returned.

    Args:
        company: Search keyword (a company name) sent as the ``q`` parameter.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Send the query through ``params`` so requests percent-encodes the
    # keyword; a name containing '&' or '#' would otherwise corrupt the URL.
    url = 'https://search.sina.com.cn/'
    params = {'country': 'usstock', 'q': company, 'c': 'news', 'ie': 'utf-8'}
    response = requests.get(url, params=params, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page and reporting
    # an empty-but-"successful" crawl.
    response.raise_for_status()
    res = response.text
    # Step 1 (debug): print(res) to confirm the page source is not garbled.

    # Extract link, title and date with non-greedy patterns taken from the
    # result markup.
    p_href = r'<h2><a href="(.*?)" target="_blank">'
    p_title = r'<h2><a href=".*?" target="_blank">(.*?)</a>'
    p_date = r'<span class="fgray_time">(.*?)</span>'
    href = re.findall(p_href, res, re.S)
    title = re.findall(p_title, res, re.S)
    date = re.findall(p_date, res, re.S)
    # Step 2 (debug): print(href, title, date) to eyeball the raw matches.

    # Step 3: clean up and print. zip() stops at the shortest list, so a page
    # where the three patterns match a different number of times no longer
    # raises IndexError halfway through the output.
    for index, (link, raw_title, raw_date) in enumerate(zip(href, title, date), start=1):
        # Drop inline tags left inside the title text.
        clean_title = re.sub(r'<.*?>', '', raw_title)
        # Keep the second space-separated field; presumably the text is
        # "<source> <date> <time>" -- verify against the live page. Fall back
        # to the raw text when there is no second field.
        fields = raw_date.split(' ')
        clean_date = fields[1] if len(fields) > 1 else raw_date
        print(str(index) + '.' + clean_title + '-' + clean_date)
        print(link)

#{思路5：调用函数}
# Companies whose Sina news results are crawled, one search per name.
companys = ['阿里巴巴', '万科', '京东', '恒大', '百度']
for company in companys:
    # Isolate failures per company so one bad request does not stop the rest.
    try:
        sina(company)
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still terminate the script, and
        # show the reason instead of silently discarding it.
        print(company + '新浪新闻爬取失败：' + repr(exc) + '\n')
    else:
        print(company + '新浪新闻爬取成功' + '\n')

运行结果：