爬虫思路
通过搜狗微信的搜索功能实现输入关键字搜索并实现自动化翻页爬取100页搜索内容(搜狗搜索内容最多显示100页即1000条信息),保存的数据如下:
title 文章标题
weixin_name 发布人名称
time 文章发布时间
content 内容简介
url 文章链接
知识点
- Selenium 模拟登陆并使用cookies访问
- scrapy框架的基本使用
- 使用打码平台自动输入验证码
前期准备
安装库
pip install selenium # 浏览器自动化框架
pip install fake_useragent # 随机请求头
pip install scrapy # 爬虫框架
pip install requests pillow # 中间件中调用打码平台接口、裁剪验证码截图时用到
scrapy startproject lu_wechatSogou 创建爬虫项目
cd ./lu_wechatSogou
scrapy genspider SougouwechatLu weixin.sogou.com 创建爬虫(参数为爬虫名和域名,不带 .py 后缀和 https:// 前缀)
正文
# -*- coding: utf-8 -*-
import scrapy
import datetime
import json
from lu_wechatSogou.items import AccountItem,ArticleItem
class SougouwechatLuSpider(scrapy.Spider):
    """Crawl Sogou WeChat article search results for one keyword.

    Starting at page 1, every result page is parsed into ``ArticleItem``
    objects and the following page is requested until page 100 has been
    requested. Cookies stored in ``cookies.json`` are attached to the first
    request when that file exists.
    """

    name = 'sougouWechat_lu'

    # Article search URL template.
    page_article_url = 'https://weixin.sogou.com/weixin?query={word}&_sug_type_=&s_from=input&_sug_=n&type=2&page={page}&ie=utf8'
    # Official-account search URL template.
    page_account_url = 'https://weixin.sogou.com/weixin?query={word}&_sug_type_=&s_from=input&_sug_=n&type=1&page={page}&ie=utf8'
    # Number of the most recently requested result page.
    page = 1

    def __init__(self, keyword=None, tableName=None, **kwargs):
        """Store the search keyword and the cookie file location.

        Args:
            keyword: Text to search for.
            tableName: Accepted for compatibility with existing callers;
                not used by this spider.
            **kwargs: Any other spider arguments, forwarded to
                ``scrapy.Spider.__init__``.
        """
        super().__init__(**kwargs)
        self.keyword = keyword
        # Path of the cookie file, relative to the working directory.
        self.cookies_file_path = 'cookies.json'

    def _load_cookies(self):
        """Return the saved cookies as a name -> value dict, or None if no file exists."""
        try:
            with open(self.cookies_file_path, 'r') as f:
                cookie_list = json.load(f)
        except FileNotFoundError:
            return None
        return {cookie['name']: cookie['value'] for cookie in cookie_list}

    def start_requests(self):
        """Yield the request for the first article result page.

        Saved cookies are attached when the cookie file exists; otherwise
        the request is sent without cookies.
        """
        cookies = self._load_cookies()
        if cookies is not None:
            print("带着有登录状态的cookies去访问")
        # NOTE(review): 'retry_times' is also the meta key Scrapy's
        # RetryMiddleware keeps its retry counter under - confirm that
        # seeding it with True is intended.
        yield scrapy.Request(
            url=self.page_article_url.format(word=self.keyword, page=1),
            callback=self.article_parse,
            meta={'page': 1, 'word': self.keyword, 'retry_times': True},
            dont_filter=True,
            cookies=cookies,
        )
        # Official-account crawling (page_account_url -> account_parse) is
        # not enabled.

    def article_parse(self, response):
        """Parse one article result page and request the next one.

        Yields:
            One ``ArticleItem`` per search result, followed by the request
            for the next page while fewer than 100 pages were requested.
        """
        print("当前为第{}页,状态码为{}".format(self.page, response.status))
        cookies = response.request.cookies
        page = response.meta.get('page')

        results = response.xpath('//ul[@class="news-list"]//li//div[@class="txt-box"]')
        for info in results:
            item = ArticleItem()
            item['page'] = "第{}页".format(page)
            item['title'] = ''.join(info.xpath('./h3/a//text()').extract())
            item['weixin_name'] = info.xpath('.//a[@class="account"]/text()').extract_first()

            # The publish time is embedded in an inline script as
            # document.write(timeConvert('<unix timestamp>')).
            timestamp = info.xpath('.//span[@class="s2"]//text()').re_first(
                r"document.write\(timeConvert\('(.*?)'\)\)")
            if timestamp:
                # Unix timestamp -> local "YYYY-MM-DD HH:MM:SS" string.
                published = datetime.datetime.fromtimestamp(int(timestamp))
                item['time'] = published.strftime("%Y-%m-%d %H:%M:%S")
            else:
                item['time'] = None

            item['content'] = ''.join(info.xpath('.//p[@class="txt-info"]//text()').extract())
            item['url'] = info.xpath('./h3/a/@data-share').extract_first()
            yield item

        # Pagination: stop once page 100 has been requested.
        if self.page < 100:
            self.page += 1
            proxy = response.meta.get('proxy')
            yield scrapy.Request(
                url=self.page_article_url.format(word=self.keyword, page=self.page),
                callback=self.article_parse,
                cookies=cookies,
                meta={'page': self.page, 'word': self.keyword, 'proxy': proxy},
                dont_filter=True,
            )

    def account_parse(self, response):
        """Parse an official-account result page (not implemented)."""
        pass
- middlewares.py:添加header请求头中间件、验证码中间件
import json
import time
import requests
from scrapy import signals
from fake_useragent import UserAgent #获取众多浏览器版本
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
'''
header中间件
'''
# Header middleware section.
# User-Agent string used by the middlewares in this module. The assignment
# runs once at import time, so the same value is reused for the whole run.
# NOTE(review): presumably `UserAgent().firefox` picks a random Firefox
# User-Agent string - confirm against the fake_useragent documentation.
User_Agent = UserAgent().firefox
class UserAgentMiddleware(object):
    """Downloader middleware that sets a fixed group of browser-like headers."""

    def __init__(self):
        # WeChat QR-code login page, sent as the Referer.
        self.Referer = 'https://open.weixin.qq.com/connect/qrconnect?appid=wx6634d697e8cc0a29&scope=snsapi_login&response_type=code&redirect_uri=https%3A%2F%2Faccount.sogou.com%2Fconnect%2Fcallback%2Fweixin&state=616e9ff5-2b7d-439b-9b49-ebf307f6aa56&href=https%3A%2F%2Fdlweb.sogoucdn.com%2Fweixin%2Fcss%2Fweixin_join.min.css%3Fv%3D20170315'
        self.Host = 'weixin.sogou.com'
        self.Connection = 'keep-alive'
        self.Accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'

    def process_request(self, request, spider):
        """Overwrite the outgoing request's headers in place.

        The parameters follow Scrapy's documented order
        ``(request, spider)``; the previous ``(spider, request)`` order only
        worked when both arguments were passed by keyword.

        Returns:
            None, so the request continues through the remaining
            middlewares.
        """
        print('请求头using headers!')
        headers = request.headers
        headers['User-Agent'] = User_Agent
        headers['Referer'] = self.Referer
        headers['Host'] = self.Host
        headers['Connection'] = self.Connection
        headers['Accept'] = self.Accept
        # Header values are text, so send '1' rather than the integer 1.
        headers['Upgrade-Insecure-Requests'] = '1'
        # NOTE(review): advertising 'br' presumably requires brotli support
        # in the downloader - confirm, otherwise drop it.
        headers['Accept-Encoding'] = 'gzip, deflate, br'
        headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
        return None
"""
验证码中间件
遇到验证码时调用selenium进行打码操作
此处登录需人工扫码完成;验证码通过打码平台Api(rkClient)自动识别,使用前需填写平台账号信息
"""
class CodeMiddleware(object):
    """Downloader middleware that obtains cookies through a Firefox session.

    ``process_request`` opens Firefox for a manual login whenever a request
    carries no cookies. ``process_response`` replays the URL of a 302
    response in Firefox, submits the captcha via ``rkClient`` when a captcha
    form is shown, and returns the request with the browser's cookies so it
    is scheduled again.
    """

    def __init__(self):
        # Cookies (name -> value) collected from the browser so far.
        self.new_cookies = {}
        self.login_url = 'https://weixin.sogou.com'
        self.cookies_file_path = 'cookies.json'

    def get_browser_cookies(self):
        """Read the cookie file and return its cookies as a name -> value dict."""
        with open(self.cookies_file_path, 'r') as f:
            cookie_list = json.load(f)
        return {cookie['name']: cookie['value'] for cookie in cookie_list}

    def process_request(self, request, spider):
        """Attach login cookies to a request that has none.

        When ``request.cookies`` is empty, Firefox is opened on the login
        page, the user finishes the login by hand and presses Enter, and the
        browser cookies are written to the cookie file and set on the
        request.

        Returns:
            None, so the request continues through the remaining
            middlewares.
        """
        print("cookie{}".format(request.cookies))
        if request.cookies:
            return None

        driver = webdriver.Firefox()
        try:
            wait = WebDriverWait(driver, 10)
            driver.get(self.login_url)
            login = wait.until(EC.element_to_be_clickable((By.ID, 'loginBtn')))
            login.click()
            input('请完成登录后点击回车')
            cookies = driver.get_cookies()
        finally:
            # quit() also ends the driver session; runs on errors as well.
            driver.quit()

        with open(self.cookies_file_path, 'w') as f:
            json.dump(cookies, f)
        # NOTE(review): with this middleware ordered after Scrapy's cookie
        # middleware, these cookies presumably only take effect on later
        # requests - confirm the intended middleware order.
        request.cookies = self.get_browser_cookies()
        return None

    def rkClient(self, image):
        """Send a captcha image to the Ruokuai service and return its answer.

        Args:
            image: Raw bytes of the captcha image.

        Returns:
            The ``Result`` value of the service's JSON reply. ``'****'`` is
            returned when the request or the JSON decoding fails, and an
            empty value when the reply carries no result.
        """
        verify_code = '****'
        base_params = {
            'typeid': 3060,
            'timeout': 60,
            'username': 'xxx',  # account name
            'password': 'xxx',  # account password
            'softid': 'xxxx',  # software id
            'softkey': 'xxxxxx'  # software key
        }
        headers = {
            'Connection': 'Keep-Alive',
            'Expect': '100-continue',
            'User-Agent': 'ben',
        }
        files = {'image': ('a.jpg', image)}
        try:
            # Without a timeout a stalled connection would block the crawl
            # forever; allow a little more than the 'timeout' parameter.
            resp = requests.post('http://api.ruokuai.com/create.json', data=base_params,
                                 files=files, headers=headers,
                                 timeout=base_params['timeout'] + 10)
        except Exception as e:
            print('get_verify_code error: ', e)
            return verify_code
        try:
            verify_code = resp.json().get('Result', '')
        except Exception as e:
            print('get_verify_code failed: ', e)
            return verify_code
        if not verify_code:
            # Show the raw reply to help diagnose the missing result.
            try:
                print(resp.text)
            except Exception:
                print('verify code resp is None')
        return verify_code

    def process_response(self, request, response, spider):
        """Handle a 302 response by refreshing cookies in Firefox.

        Any other status is passed through unchanged. For a 302 the URL is
        opened in Firefox with the saved cookies:

        * Case one - a captcha form is shown: the captcha is cropped from a
          screenshot, solved by ``rkClient`` and submitted; the resulting
          cookies are saved to the cookie file.
        * Case two - a normal page is shown: the browser cookies are taken
          as they are.

        Returns:
            ``response`` when the status is not 302; otherwise ``request``
            (with refreshed cookies where they were obtained) so that it is
            scheduled again.
        """
        if response.status != 302:
            print("[+]200 Continue .....")
            return response

        print("当前网址为:{},302,被重定向了!!!!!!!".format(response.url))
        print("请修改下cookies,以此退出302的状态")

        # Make the browser send the same User-Agent as the crawler. Firefox
        # takes this from a preference, not from a command-line argument.
        firefoxOptions = webdriver.FirefoxOptions()
        firefoxOptions.set_preference('general.useragent.override', User_Agent)
        browser = webdriver.Firefox(options=firefoxOptions)
        try:
            browser.set_window_size(1200, 800)
            browser.get('https://weixin.sogou.com')
            browser.delete_all_cookies()
            time.sleep(3)

            # Restore the saved cookies into the browser session.
            with open(self.cookies_file_path, 'r') as f:
                saved_cookies = json.load(f)
            time.sleep(1)
            for cookie in saved_cookies:
                browser.add_cookie(cookie)

            browser.get(response.url)
            print("进入是否为验证码页面的判断")

            # A page with this element carrying text is treated as the
            # "not found" page: retry the request unchanged.
            not_found = browser.find_elements(By.XPATH, '//div[@id="main-message"]/h1/span')
            if not_found and not_found[0].text:
                print("当前页面为not Found")
                return request

            wait = WebDriverWait(browser, 15)
            print("搜索验证码文本框")
            try:
                input_text = wait.until(EC.presence_of_element_located((By.ID, 'seccodeInput')))
            except Exception:
                # No captcha input appeared within the wait: case two.
                input_text = None

            if input_text is not None:  # case one
                print("当前为验证码页面,开始处理")
                try:
                    button = wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
                except Exception:
                    # The captcha cannot be submitted without the button;
                    # retry the request instead of failing on a missing name.
                    print("没有找到按钮")
                    return request

                # Crop the captcha out of a full-page screenshot.
                # NOTE(review): the crop box presumably matches the
                # 1200x800 window set above - confirm on the target machine.
                browser.save_screenshot('CaptchaPage.png')
                with Image.open('CaptchaPage.png') as page_image:
                    page_image.crop((572, 343, 700, 395)).save('code.png')
                with open('code.png', 'rb') as f:
                    image = f.read()

                code = str(self.rkClient(image))
                input_text.clear()
                time.sleep(2)
                input_text.send_keys(code)
                time.sleep(1)
                button.click()
                print("已输入验证码,保存新的cookies")
                time.sleep(2)

                new_listCookie = browser.get_cookies()
                with open(self.cookies_file_path, 'w') as f:
                    json.dump(new_listCookie, f)
            else:  # case two
                print("当前为情况二不需要输入验证码")
                new_listCookie = browser.get_cookies()

            for cookie in new_listCookie:
                self.new_cookies[cookie['name']] = cookie['value']
        finally:
            # Always release the browser, including on early returns/errors.
            browser.quit()

        # Give the request its own copy so later updates to
        # self.new_cookies do not silently change requests already queued.
        request.cookies = dict(self.new_cookies)
        print("浏览器cookie: {}".format(self.new_cookies))
        return request
# -*- coding: utf-8 -*-
# Scrapy project settings for the lu_wechatSogou crawler.

BOT_NAME = 'lu_wechatSogou'

# Package that holds the spiders (also used for newly generated ones).
NEWSPIDER_MODULE = 'lu_wechatSogou.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Request pacing.
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOAD_TIMEOUT = 60  # download timeout, in seconds

COOKIES_ENABLED = True

# Project downloader middlewares and their order values.
DOWNLOADER_MIDDLEWARES = {
    'lu_wechatSogou.middlewares.UserAgentMiddleware': 544,
    'lu_wechatSogou.middlewares.CodeMiddleware': 812,
}