Here we log into Chouti (dig.chouti.com) from the spider and upvote every post; I won't disclose the real account credentials, ha.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.cookies import CookieJar
from scrapy.http import Request
from scrapy.selector import Selector


class ChoutioSpider(scrapy.Spider):
    name = 'choutio'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Send browser-like headers so the IP does not get banned
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
    }
    cookie = {}

    def parse(self, response):
        # Capture the anonymous session cookies from the first response
        cookie_obj = CookieJar()
        cookie_obj.extract_cookies(response=response, request=response.request)
        print(cookie_obj._cookies)
        self.cookie = cookie_obj._cookies
        # Log in to Chouti
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body='phone=86150xxxxxxx&password=xxxxxx&oneMonth=1',
            headers={'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            cookies=cookie_obj._cookies,
            callback=self.loginsuccess,
        )

    def loginsuccess(self, response):
        # Print the body; code:999 in the response means the login succeeded
        print(response.text)
        # Go back to the Chouti front page and start upvoting
        yield Request(url="https://dig.chouti.com/", callback=self.praise)

    def praise(self, response):
        # Collect the post ids on this page to build the vote URLs
        id_list = Selector(response).xpath("//div[@share-linkid]/@share-linkid").extract()
        for link_id in id_list:
            url = "https://dig.chouti.com/link/vote?linksId=%s" % link_id
            print(url, "oooooooooooooooooo")
            # POST to the vote URL; the session cookie has to be sent along
            yield Request(
                url=url,
                method='POST',
                cookies=self.cookie,
                # Check whether the upvote succeeded
                callback=self.confirm_praise,
            )
        # Follow the pagination links and upvote every page as well
        page_urls = Selector(response).xpath("//a[@class='ct_pagepa']/@href").extract()
        for item in page_urls:
            page_url = "https://dig.chouti.com%s" % item
            print(page_url, "xxxxxxxxxxxx")
            yield Request(
                url=page_url,
                callback=self.praise,
            )

    def confirm_praise(self, response):
        print(response.text)
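One remark on the cookie handling: Scrapy's CookiesMiddleware is enabled by default (COOKIES_ENABLED = True) and already carries session cookies across requests, so the manual CookieJar plumbing above is not strictly required. Below is a minimal alternative sketch, not the code from this post: the spider name is made up, and FormRequest simply re-sends the same phone/password/oneMonth fields as the POST body above.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest


class ChoutiAutoCookieSpider(scrapy.Spider):
    # Hypothetical variant that leaves cookie handling to CookiesMiddleware.
    name = 'chouti_auto_cookie'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # The middleware recorded the anonymous cookie from this response
        # and will resend it automatically on every later request.
        yield FormRequest(
            url='https://dig.chouti.com/login',
            formdata={'phone': '86150xxxxxxx', 'password': 'xxxxxx', 'oneMonth': '1'},
            callback=self.after_login,
        )

    def after_login(self, response):
        print(response.text)  # per the post, code:999 in the body means success
        # From here, reuse the praise() logic above unchanged, minus every
        # explicit cookies=... argument.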
settings.py: I set the recursion depth to 5 here; adjust it to your own situation.

DEPTH_LIMIT = 5  # recursion depth; 0 means no depth limit, so the crawl can loop forever
DUPEFILTER_CLASS = 'spider1.duplication.RepeatFilter'
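If it is unclear what DEPTH_LIMIT actually counts: Scrapy's DepthMiddleware stamps every response with its recursion depth in response.meta, and drops requests that would exceed the limit. A small hypothetical demo spider (not part of this post) that prints that depth while following the same pagination links as praise() above:

import scrapy


class DepthDemoSpider(scrapy.Spider):
    # Hypothetical spider, only for visualizing what DEPTH_LIMIT limits.
    name = 'depth_demo'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    custom_settings = {'DEPTH_LIMIT': 5}

    def parse(self, response):
        # DepthMiddleware records how many hops led to this response;
        # once the limit is reached, deeper requests are dropped, which is
        # why DEPTH_LIMIT = 0 (no limit) can recurse indefinitely.
        print(response.url, "depth =", response.meta.get('depth', 0))
        for href in response.xpath("//a[@class='ct_pagepa']/@href").extract():
            yield response.follow(href, callback=self.parse)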
Define the custom RepeatFilter class in a file named duplication.py and wire it up in settings.py as shown above. It mainly re-implements the interface of Scrapy's BaseDupeFilter, whose job is to check whether a URL has already been visited; the stock implementation, RFPDupeFilter, lives in the same module. Source: scrapy/dupefilters.py, which begins like this:
from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):
    # from_settings, request_seen, open, close and log are defined here;
    # the custom RepeatFilter below mirrors exactly this interface.
    ...
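Worth noting: the default RFPDupeFilter does not compare raw URL strings the way the class below does. request_fingerprint hashes the method, the canonicalized URL and the body, so the same endpoint with its query parameters reordered still counts as one request. A quick check (the foo parameter is made up for the demo):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# The same vote endpoint with its query parameters swapped around:
r1 = Request("https://dig.chouti.com/link/vote?linksId=1&foo=2")
r2 = Request("https://dig.chouti.com/link/vote?foo=2&linksId=1")
print(request_fingerprint(r1) == request_fingerprint(r2))  # True
# The url_set comparison below would treat these as two distinct URLs.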
# duplication.py — custom class that decides whether a URL is a duplicate
class RepeatFilter(object):

    def __init__(self):
        self.url_set = set()

    @classmethod
    def from_settings(cls, settings):
        # Scrapy builds the dupefilter through this factory method
        return cls()

    def request_seen(self, request):
        # Return True (the request is dropped) if the URL was already crawled
        if request.url in self.url_set:
            return True
        self.url_set.add(request.url)
        return False

    def open(self):  # can return deferred
        print("open xxx")

    def close(self, reason):  # can return a deferred
        print("close xxx")

    def log(self, request, spider):  # log that a request has been filtered
        pass
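To sanity-check the filter without launching a crawl, it can be driven by hand; a small sketch, assuming the project layout implied by the DUPEFILTER_CLASS path above:

from scrapy.http import Request
from spider1.duplication import RepeatFilter

f = RepeatFilter()
print(f.request_seen(Request("https://dig.chouti.com/")))  # False: first visit, will be crawled
print(f.request_seen(Request("https://dig.chouti.com/")))  # True: duplicate, the scheduler drops it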
The upvote results: