利用 Python 抓取 Reddit 数据的简单 demo

最新推荐文章于 2025-05-22 20:07:45 发布

码农小小苏

最新推荐文章于 2025-05-22 20:07:45 发布

阅读量357

点赞数 6

分类专栏： Python 文章标签： python 开发语言

本文链接：https://blog.csdn.net/m0_75266675/article/details/145633927

版权

Python 专栏收录该内容

11 篇文章

订阅专栏

import praw
import pymongo
from datetime import datetime
from pymongo import MongoClient

# Reddit API client. The three string values are placeholders and must be
# replaced with the credentials of your own Reddit application.
reddit = praw.Reddit(
    user_agent="你的user_agent",
    client_secret="你的client_secret",
    client_id="你的client_id",
)

# MongoDB handles: server on localhost:27017, database "reddit_db",
# collection "reddit_posts" (the collection the scraper writes to).
client = MongoClient('mongodb://localhost:27017/')
db = client['reddit_db']
collection = db['reddit_posts']

def scrape_reddit_data(subreddit_name, limit=10):
    """Fetch hot posts from a subreddit and upsert them into MongoDB.

    Every post is written to the module-level ``collection`` with an
    upsert keyed on ``post_id``, so running the scraper again updates the
    existing documents instead of inserting duplicates. A failure while
    saving one post is printed and does not stop the remaining posts.

    Args:
        subreddit_name (str): Name of the subreddit to scrape.
        limit (int): Maximum number of hot posts to request.
    """
    # Imported locally because the file's top-level imports only bring in
    # ``datetime`` itself.
    from datetime import timezone

    subreddit = reddit.subreddit(subreddit_name)

    for post in subreddit.hot(limit=limit):
        post_data = {
            # NOTE(review): str() turns a missing author (None) into the
            # literal string 'None' -- confirm that is the intended value.
            'author': str(post.author),
            'title': post.title,
            'score': post.score,
            'url': post.url,
            # Build a timezone-aware UTC value. A bare fromtimestamp()
            # yields naive *local* time, which would be stored shifted by
            # the host's UTC offset under a field named "created_utc".
            'created_utc': datetime.fromtimestamp(
                post.created_utc, tz=timezone.utc
            ),
            'post_id': post.id,
            'permalink': post.permalink,
            'num_comments': post.num_comments,
            # Aware replacement for datetime.utcnow(), which is deprecated
            # since Python 3.12.
            'scraped_at': datetime.now(timezone.utc),
        }

        # Store the document in MongoDB; one bad post must not abort the run.
        try:
            collection.update_one(
                {'post_id': post.id},  # match on the Reddit post id
                {'$set': post_data},   # overwrite the stored fields
                upsert=True            # insert when no document matches
            )
            print(f"成功保存帖子: {post.title}")
        except Exception as e:
            print(f"保存帖子时出错: {str(e)}")

if __name__ == "__main__":
    # Example run: scrape up to 20 hot posts from the "python" subreddit.
    scrape_reddit_data("python", limit=20)