此文只用作学习。
1.环境须知
做这个爬取的时候需要安装好Python 3.6以及requests、xlwt模块(re为Python内置模块,无需单独安装)。requests主要用于爬取页面信息,xlwt主要用于与Excel的交互,将爬取数据保存至Excel;如果需要将数据保存至MongoDB,则需要额外安装与MongoDB数据库交互的模块(如pymongo)。
2.直接上代码
spider.py
import json
import re
import requests
import xlwt
#程序入口
if __name__ == "__main__":
#配置客户端信息,包括认证和登录信息
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
cookie = '_T_WM=0412b7373fef233f1fda1f4bed6f73d1; SUB=_2A252I2baDeRhGeNN61EW9yvPzTyIHXVV7AqSrDV6PUJbkdBeLW6mkW1NScphGSbvMaRIVDh46ifNydfllNkxlxDw; SUHB=0jBInbOCiHQ2AR; SCF=AsPAKsXRVR6cRsLVbdg7HDjHpygj27skkhNtmBltuLWROUm95BV0H6g6CCgxn_MFo0Ke8vyqzCz-kXPg-iQDq9Y.; MLOGIN=1; M_WEIBOCN_PARAMS=featurecode%3D20000320%26luicode%3D10000011%26lfid%3D106003type%253D1%26fid%3D100103type%253D1%2526q%253D000063%26uicode%3D10000011; WEIBOCN_FROM=1110006030'
headers = {
"User-Agent": user_agent,
"cookie": cookie
}
#设置excel交互信息
workbook = xlwt.Workbook(encoding='utf-8')
booksheet1 = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
booksheet2 = workbook.add_sheet('Sheet2', cell_overwrite_ok=True)
booksheet3 = workbook.add_sheet('Sheet3', cell_overwrite_ok=True)
list1 = []
list1.append(['id', 'mid','created_at','user_screen_name', 'user_verified_reason', 'weibo_text', 'comment_index','comment_number', 'repost_index','repost_number'])
list2 = []
list2.append(['comment_index','created_at','user_id','user_screen_name','user_text'])
list3 = []
list3.append(['repost_index','created_at', 'user_id', 'user_screen_name', 'user_text'])
#下面两个变量记