需求
- 抓取微信公众号历史发表的文章
- URL去重
- 如果某页的列表中某篇文章已经抓取,则不继续翻页
思路
大体思路借鉴Python抓取微信公众号全部文章,
- 微信扫码登录获取cookies、token
- 发送request请求得到fakeid(公众号)
- 得到请求页数的参数
- 得到微信公众号的文章链接
- 把上述链接导入scrapy进行抓取文章内容
Scrapy文件结构
代码
import os
import re
import time
from fake_useragent import UserAgent
from selenium.webdriver import Firefox
import requests as req
import pymongo
import random
import json
# from weixin.model import WeiXinData,db_connect,create_table
from sqlalchemy.orm import sessionmaker
import urllib3
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from scrapy.utils.project import get_project_settings
from scrapy import Request
from selenium.webdriver.common.proxy import Proxy, ProxyType
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def set_cookies(cookie_file, wait_seconds=120):
    """Log in via WeChat QR-code scan and persist the browser cookies.

    Opens a Firefox window at the module-level ``URL``, waits
    ``wait_seconds`` for the user to complete the scan-login manually,
    then collects the session cookies, writes them to ``cookie_file``
    as JSON, and returns them.

    Args:
        cookie_file: Path of the JSON file the cookies are written to.
        wait_seconds: Seconds to wait for the manual QR-code login.
            Defaults to 120, matching the previous hard-coded delay.

    Returns:
        dict: Mapping of cookie name to cookie value.
    """
    # NOTE(review): relies on a module-level ``URL`` defined elsewhere
    # in this file — confirm it points at the mp.weixin.qq.com login page.
    with webdriver.Firefox() as driver:
        driver.get(URL)
        # Block while the user scans the QR code in the opened browser.
        time.sleep(wait_seconds)
        post_cookie = {item['name']: item['value']
                       for item in driver.get_cookies()}
        with open(cookie_file, 'w', encoding='utf-8') as f:
            json.dump(post_cookie, f)
        print(driver.current_url)
        return post_cookie
def get_token(cookie_file):
'''get token from cookie'''
with open(cookie_file, 'r', encoding='utf-8') as f:
cookie = f.read()
cookies = json.loads(cookie)
res = req.get(url=URL