python爬取微博热搜

最新推荐文章于 2024-08-02 17:01:43 发布

duzhongqiang

最新推荐文章于 2024-08-02 17:01:43 发布

阅读量799

点赞数 3

分类专栏：爬虫　文章标签：深度学习

本文链接：https://blog.csdn.net/duzhongqiang/article/details/113783844

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

为了对 BeautifulSoup 库进行测试，本文简单实现了对微博热搜与百度新闻网页中词条及其链接的爬取，完整代码（包括可执行文件）见 GitHub。

环境

python3

依赖库安装

pip install beautifulsoup4
pip install urllib3  # 可选：下文代码实际使用的是 Python 标准库 urllib.request，无需额外安装

爬取微博热搜代码

# 抓取微博热搜词条信息
import urllib.request
from bs4 import BeautifulSoup 
import datetime
import time

def getdata(url, outfile='weiBoSearch.txt', timeout=10):
    """Fetch a Weibo hot-search page and dump its search entries to a text file.

    The page at ``url`` is downloaded, parsed with BeautifulSoup, and every
    ``<a>`` tag whose ``href`` contains ``q=`` (past index 0) is written to
    ``outfile`` as one line holding the timestamp, the link text and the
    absolute link. The first line of the file is the page title. The file is
    overwritten on every call.

    Args:
        url: Address of the page to scrape.
        outfile: Path of the text file to (over)write.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        None. Results are written to ``outfile`` and a status line is printed.

    Raises:
        urllib.error.URLError: If the request fails (propagated from urlopen).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Function-scope import so the block does not rely on urllib.parse having
    # been loaded as a side effect of importing urllib.request.
    from urllib.parse import urljoin

    # Browser-like request headers.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML,like GeCKO) Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3',
        'Connection':'keep-alive'}
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read/decode fails;
    # the timeout keeps a stalled server from blocking the caller forever.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        html = response.read().decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')

    # One timestamp per call, shared by every written line and the status message.
    time_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Read the title through the parse tree instead of slicing the tag's
    # string form, and tolerate pages without <head> or <title>.
    head = soup.head
    title_tag = head.title if head is not None else None
    title = title_tag.get_text() if title_tag is not None else ''

    base_url = 'https://s.weibo.com'
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent the Chinese text being written.
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(title + '\n')
        for tag in soup.find_all("a"):
            href = tag.get("href")
            if href is None:
                # Anchor without a link target; nothing to record.
                continue
            href = str(href)
            name = str(tag.text)
            # Keep only search links and skip the "feedback" footer entry.
            if href.find('q=') > 0 and name != '意见反馈':
                # urljoin resolves relative hrefs against the site root and
                # leaves already-absolute hrefs intact.
                newurl = urljoin(base_url, href)
                f.write(time_str + ':  词条：' + name + '    链接：' + newurl + '\n')
    print(time_str + ' ' + title + '爬取成功！\n')

if __name__ == '__main__':
    # Script entry point: re-scrape the hot-search page once per second
    # until the user interrupts with Ctrl-C.
    url = 'https://s.weibo.com/top/summary'
    try:
        while True:
            try:
                getdata(url)
            except OSError as exc:
                # URLError, HTTPError and socket timeouts are all OSError
                # subclasses; a transient network failure should not end
                # the polling loop, so report it and try again.
                print('爬取失败，稍后重试：' + str(exc))
            time.sleep(1)
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C instead of printing a traceback.
        pass