爬取内容主要包括 CSDN 论坛的帖子、帖子详情(评论回复)和用户信息。
1、定义并生成 3 张数据库表(Topic、Answer、Author),保存为 model.py,供第 2 部分通过 `from model import *` 导入:
from peewee import *
# Shared peewee handle for the "spider" MySQL database on 127.0.0.1:3306.
# NOTE(review): the user name and password are hard-coded in source; consider
# loading them from configuration before sharing or deploying this file.
db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="123456")
class BaseModel(Model):
    """Common base class that binds every model below to the shared ``db``."""

    class Meta:
        # Subclasses inherit this setting, so each table uses the same database.
        database = db
class Topic(BaseModel):
    """A forum thread.

    ``id`` is declared as an explicit primary key holding the thread id, so
    no separate key column is generated for this table.
    """

    title = CharField()  # thread title
    content = TextField(default="")  # thread body
    id = BigIntegerField(primary_key=True)  # thread id
    author = CharField()  # thread author
    create_time = DateTimeField()  # when the thread was created
    answer_nums = IntegerField(default=0)  # number of replies
    click_nums = IntegerField(default=0)  # number of views
    like_nums = IntegerField(default=0)  # number of likes
    score = IntegerField(default=0)  # bounty points offered
    status = CharField()  # thread status
    # presumably the time of the most recent reply -- confirm against the crawler
    last_answer_time = DateTimeField()
class Answer(BaseModel):
    """A reply posted in a thread.

    No primary key is declared here, so peewee adds its default
    auto-incrementing ``id`` column.
    """

    # Plain integer column, not a ForeignKeyField; presumably holds the
    # ``Topic.id`` of the thread being answered -- confirm against the crawler.
    topic_id = BigIntegerField()
    author = CharField()
    content = TextField(default="")
    create_time = DateTimeField()
    like_nums = IntegerField(default=0)
class Author(BaseModel):
    """Profile counters and description for a user, keyed by a string id."""

    id = CharField(primary_key=True)
    blog_nums = IntegerField(default=0)  # number of blog posts
    resources_nums = IntegerField(default=0)  # number of resources
    forum_nums = IntegerField(default=0)  # forum count
    blink_nums = IntegerField(default=0)  # number of Blink posts
    ask_nums = IntegerField(default=0)  # Q&A count
    collection_nums = IntegerField(default=0)  # number of favorites
    special_column_nums = IntegerField(default=0)  # number of columns
    desc = TextField(null=True)  # user description; may be NULL
    following_nums = IntegerField(default=0)  # following count
if __name__ == "__main__":
    # Running this module as a script creates the three tables in ``db``.
    db.create_tables([Topic, Answer, Author])
2、数据爬取
import re
import requests
from model import *
from urllib import parse
from scrapy import Selector
from datetime import datetime
# Root URL of the CSDN forum site.
domain = "https://bbs.csdn.net"
# 获取左边菜单js
def get_nodes_json():
left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
nodes_str_match = re.search("forumNodes:(.*])", left_menu_text)
if nodes_str_match:
nodes_str = nodes_str_match.group(1).replace("null", "None")
# nodes_list = ast.literal_eval(node