【归档】Python 爬虫记录每分钟的微博热搜榜

From 202012190815 +0:00

requirements.txt

beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
idna==2.10
PyMySQL==0.10.1
requests==2.25.1
soupsieve==2.1
urllib3==1.26.2

代码(collect.py)

# 微博热搜榜 每分钟更新一次
# 微博要闻榜 每分钟更新一次
# https://s.weibo.com/top/summary/summary?cate=realtimehot
# https://s.weibo.com/top/summary/summary?cate=socialevent

import requests
from bs4 import BeautifulSoup


def get_real_time_hot(timeout=10):
    """Fetch and parse the Weibo real-time hot-search ranking page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(ok, items)`` tuple. ``ok`` is False (and ``items`` is empty)
        when the request fails, the HTTP status is not 200, or the ranking
        table cannot be located in the page. Otherwise ``items`` is a list
        of dicts with the keys ``rank``, ``title``, ``hot``, ``tag`` and
        ``link``, one per ranked row.
    """
    url = "https://s.weibo.com/top/summary/summary?cate=realtimehot"
    try:
        # Without a timeout a stalled connection would hang the caller.
        r = requests.get(url=url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a non-200 response.
        return False, []
    if r.status_code != 200:
        return False, []

    soup = BeautifulSoup(r.text, 'html.parser')
    container = soup.find("div", "data")
    tbody = container.find("tbody") if container is not None else None
    if tbody is None:
        # Page layout is not what we expect; fail instead of raising.
        return False, []

    result = []
    for tr in tbody.find_all("tr"):
        rank_top = tr.find("td", "ranktop")
        if rank_top is None:
            # Rows without a rank cell are not part of the ranking.
            continue
        anchor = tr.find("a")
        link = anchor.get("href") if anchor is not None else None
        if link is None:
            continue
        try:
            rank_top_idx = int(rank_top.get_text())
        except ValueError:
            # Rank cell does not hold a number; skip the row.
            continue

        # The heat value is the last whitespace-separated token of the span;
        # fall back to 0 when it is absent or not numeric.
        span = tr.find("span")
        hot_tokens = span.get_text().split() if span is not None else []
        try:
            hot_num = int(hot_tokens[-1]) if hot_tokens else 0
        except ValueError:
            hot_num = 0

        icon = tr.find("i")
        tag = icon.get_text() if icon is not None else ""

        result.append({
            "rank": rank_top_idx,
            "title": anchor.get_text(),
            "hot": hot_num,
            "tag": tag,
            "link": "https://s.weibo.com" + link,
        })
    return True, result


if __name__ == "__main__":
    # Manual check: print every ranked entry, one dict per line.
    ok, res = get_real_time_hot()
    if not ok:
        # SystemExit with a message prints it to stderr and exits non-zero,
        # so a failed fetch is no longer indistinguishable from an empty list.
        raise SystemExit("failed to fetch the Weibo real-time hot list")
    for item in res:
        print(item)

使用阿里云函数计算服务每分钟定时执行

# -*- coding: utf-8 -*-
import json
import logging
import os
import sys
from datetime import datetime

import pymysql
from collect import get_real_time_hot

# Root logger; used below to report connection and insert errors.
logger = logging.getLogger()
# Module-level MySQL connection, assigned by connect_mysql() and used by
# handler().
conn = None


def connect_mysql():
    """Open the MySQL connection and store it in the module-level ``conn``.

    Connection settings are read from the environment variables
    ``MYSQL_HOST``, ``MYSQL_PORT`` (defaults to 3306 when unset),
    ``MYSQL_USER``, ``MYSQL_PASSWD`` and ``MYSQL_DB``.

    Raises:
        SystemExit: with a non-zero status when the connection cannot be
            established, after logging the underlying error.
    """
    global conn
    try:
        conn = pymysql.connect(
            host=os.environ.get("MYSQL_HOST"),
            # An unset port previously produced int(None) -> TypeError.
            port=int(os.environ.get("MYSQL_PORT", 3306)),
            user=os.environ.get("MYSQL_USER"),
            passwd=os.environ.get("MYSQL_PASSWD"),
            db=os.environ.get("MYSQL_DB"),
            connect_timeout=5
        )
    except Exception as e:
        logger.error("ERROR: Unexpected error: Could not connect to MySql instance.")
        logger.error(e)
        # Exit with a failure status; a bare sys.exit() reports success (0).
        sys.exit(1)


def initializer(context):
    """Establish the MySQL connection by calling ``connect_mysql()``.

    ``context`` is accepted but not used here.
    NOTE(review): presumably the Function Compute initializer hook, run
    before ``handler`` -- confirm against the service configuration.
    """
    connect_mysql()


def handler(event, context):
    """Fetch the current hot-search list and store it in ``realtimehot``.

    Every row of one invocation shares a ``series_id`` built from the
    current local time formatted as ``YYYYmmddHHMM``.

    Args:
        event: Unused by this function.
        context: Unused by this function.

    Returns:
        The fetched items serialized as a JSON string (non-ASCII characters
        are kept as-is). This is ``"[]"`` when the fetch failed.
    """
    global conn
    try:
        conn.ping()
    except Exception:
        # Covers both a dropped connection and ``conn`` still being None;
        # narrower than a bare ``except`` so SystemExit/KeyboardInterrupt
        # are not swallowed.
        connect_mysql()

    series_id = datetime.now().strftime("%Y%m%d%H%M")
    ok, data = get_real_time_hot()
    if not ok:
        logger.error("Failed to fetch the real-time hot list; nothing stored.")

    sql = "INSERT INTO realtimehot(`series_id`, `rank`, `title`, `hot`, `tag`, `link`) \
        VALUES (%s, %s, %s, %s, %s, %s)"
    val = [
        [series_id, item["rank"], item["title"], item["hot"], item["tag"], item["link"]]
        for item in data
    ]
    if val:
        try:
            # The context manager closes the cursor even if the insert fails.
            with conn.cursor() as cursor:
                cursor.executemany(sql, val)
            conn.commit()
        except Exception as e:
            logger.error(e)
            conn.rollback()

    return json.dumps(data, ensure_ascii=False)

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值