Weibo Hot Search and Tophub (今日热榜) Scraper

Target sites:
weibo_url = 'https://s.weibo.com'
today_url = 'https://tophub.today'

# Fetch the hot-search page source
import json
import re

import requests

weibo_url = 'https://s.weibo.com'
today_url = 'https://tophub.today'


def weibo(json_list=None):
    # Scrape the Weibo hot-search ranking: rank, link, keyword and heat value
    if json_list is None:
        json_list = []
    regex = re.compile(
        r'<tr class="">\s+<td class="td-01 ranktop">(\d+)</td>\s+<td class="td-02">\s+<a href="(\S+)" target="_blank">('
        r'.*?)</a>\s+<span>(.*?)</span>\s+</td>\s+<td class="td-03">.*</td>\s+</tr>')
    lists = regex.findall(get_html(weibo_url + '/top/summary'))
    for vo in lists:
        json_list.append(dict(num=vo[0], url=weibo_url + vo[1], key=vo[2], hotNum=vo[3].strip()))
    print(json.dumps(json_list, indent=2, ensure_ascii=False))


def get_html(url):
    # Download a page with a browser-like User-Agent and return its HTML text
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.110 Safari/537.36'}
    response = requests.get(url, headers=headers)
    return response.text


def top_hub_today(results=None):
    # Scrape every board listed on tophub.today, grouped by category
    if results is None:
        results = []
    # Get the category names and links from the site navigation
    regex = re.compile(r'href="(\S+)"><div class="gb-c">(\S+)</div>')
    hh = get_html(today_url)
    title = []
    lists = regex.findall(hh)
    for vo in lists:
        if vo[0] != '/':
            title.append(dict(title=vo[1], url=vo[0]))
    # Fetch the boards for each category, page by page
    div_regex = re.compile(
        r'<div class="cc-cd-is">\s+<a href="(.*?)">\s+<div class="cc-cd-lb"><img.*>(.*?)</div>\s+</a>\s+</div>\s+<div '
        r'class="cc-cd-sb">\s+<div class="cc-cd-sb-ss cc-cd-sb-ss-ia">\s+<span class="cc-cd-sb-st">('
        r'.*?)</span>\s+</div>\s+</div>\s+</div>\s+<div class="cc-cd-cb nano">([\S\s]+?)</div>\s+</div>['
        r'\S\s]+?homepage="(\S+)" hashid="(\S+)"')
    div_top_regex = re.compile(
        r'<a href="(.*?)" target="_blank" rel="nofollow" itemid="\S+">\s+<div class="cc-cd-cb-ll">\s+'
        r'<span class="[s h]+">(\d+)</span>\s+<span class="t">(.*?)</span>\s+<span class="e">(.*?)</span>')

    for sub in title:
        print(sub['title'] + " 开始爬呀爬")
        page = 1
        subs = []
        while True:
            query_url = today_url + sub['url'] + '?p=' + str(page)
            print("---> url=[%s], 页数=[%d]" % (query_url, page))
            sub_html = get_html(query_url)
            div_list = div_regex.findall(sub_html)
            if div_list:
                page += 1
                for div in div_list:
                    # Parse the ranking entries of this board
                    tops = div_top_regex.findall(div[3])
                    top_list = [dict(url=vo[0], indx=vo[1], key=vo[2], hotNum=vo[3]) for vo in tops]
                    subs.append(dict(app=div[1].strip(), app_url=div[4], url=div[0], hash_id=div[5], top_list=top_list))
            else:
                print("---> 页数结束")
                break
        results.append(dict(name=sub['title'], url=today_url + sub['url'], sub=subs))
        print(sub['title'] + " 结束")

    # print(json.dumps(results, indent=4, ensure_ascii=False))
    save_data(results)
    print('Data saved!')


def save_data(data):
    # Write the scraped results to a local JSON file
    try:
        with open('/Users/zhang/Desktop/hot.json', 'w', encoding='utf-8') as fs:
            json.dump(data, fs, indent=4, ensure_ascii=False)
    except IOError as e:
        print(e)


if __name__ == '__main__':
    weibo()
    top_hub_today()
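
A minimal usage sketch (an assumption, not part of the original script): pass your own lists into the two functions to collect the results in code instead of relying only on the printed output and the saved JSON file. The field names in the comments come straight from the dicts built above.

# Usage sketch: capture results into local lists for further processing
items = []
weibo(items)              # each item: dict(num=..., url=..., key=..., hotNum=...)
boards = []
top_hub_today(boards)     # each category: dict(name=..., url=..., sub=[...])
print(len(items), 'Weibo entries,', len(boards), 'Tophub categories')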
