1. Simulating login with Scrapy
I created a Scrapy project whose goal is to obtain GitHub's cookies and then request my profile page. Three spider files are created, one for each of the three approaches below. The project directory is shown in the figure below.
1.1 Getting the cookie by packet capture
This approach suits sites whose cookies stay valid for a relatively long time. Capturing the login cookie in the browser's developer tools is something you already know how to do, so let's go straight to the spider code. Because the site requires a login before it can be crawled, the spider overrides the start_requests method so that the initial request already carries the captured cookie; the crawled page is saved to git.html.
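Below is a minimal sketch of such a spider. The spider name Git1Spider and the cookie string are placeholders: paste in the Cookie header you captured yourself.

import scrapy


class Git1Spider(scrapy.Spider):
    name = 'git1'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/exile-morganna']

    def start_requests(self):
        # Placeholder: paste the Cookie header captured in the browser's
        # developer tools here as one semicolon-separated string.
        cookie_str = '_octo=PLACEHOLDER; logged_in=yes; user_session=PLACEHOLDER'
        cookies = {
            pair.split('=', 1)[0].strip(): pair.split('=', 1)[1]
            for pair in cookie_str.split(';')
        }
        for url in self.start_urls:
            # Attach the captured cookies to the very first request.
            yield scrapy.Request(url, cookies=cookies, callback=self.parse)

    def parse(self, response):
        # Save the logged-in page so we can verify the cookie worked.
        with open('git.html', 'w', encoding='utf-8') as f:
            f.write(response.text)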
1.2 Sending a POST request
- Find the URL the POST goes to: click the login button while capturing traffic; the request resolves to https://github.com/session
- Find the pattern of the request body: analyzing the POST body shows that every parameter it contains already appears in the previous response (the login page), so the spider can extract them before posting
import scrapy


class Git2Spider(scrapy.Spider):
    name = 'git2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # By inspecting the login page, work out the URL the POST goes to,
    # then build the post data for it.
    def parse(self, response):
        # The authenticity_token is a hidden field in the login form
        # of the previous response.
        authenticity_token = response.xpath(
            '//*[@id="login"]/div[4]/form/input[1]/@value').extract_first()
        print(authenticity_token)
        post_data = {
            'commit': 'Sign in',
            'authenticity_token': authenticity_token,
            'login': '2020224062',
            'password': 'gss813579',
        }
        # Submit the login form.
        yield scrapy.FormRequest(
            url="https://github.com/session",
            formdata=post_data,
            callback=self.after_login
        )

    def after_login(self, response):
        # Now that we are logged in, request the profile page.
        yield scrapy.Request('https://github.com/exile-morganna',
                             callback=self.check_login)

    def check_login(self, response):
        # Save the page so we can confirm the login succeeded.
        with open('git2.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
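As an aside, Scrapy also provides FormRequest.from_response, which copies the hidden fields of the login form (including authenticity_token) automatically, so you don't have to locate them with XPath yourself. A minimal sketch under that assumption; the spider name Git2bSpider is hypothetical, and the field names match the ones used above:

import scrapy


class Git2bSpider(scrapy.Spider):
    name = 'git2b'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # from_response submits the page's first form by default and merges
        # in the fields we supply; if the page carried more than one form,
        # it could be selected explicitly with formxpath or formnumber.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'login': '2020224062', 'password': 'gss813579'},
            callback=self.after_login,
        )

    def after_login(self, response):
        yield scrapy.Request('https://github.com/exile-morganna',
                             callback=self.check_login)

    def check_login(self, response):
        with open('git2.html', 'w', encoding='utf-8') as f:
            f.write(response.text)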
1.3 Using Selenium through a downloader middleware
If you put the chromedriver executable in your Python interpreter's Scripts directory (the one PyCharm uses), you don't need to add it to the PATH; the details are easy to look up. If you are not comfortable writing the Selenium automation statements by hand, you can record them instead (the class below is such a recorded export). The main part is writing the middleware:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.options import Options


class TestGit2:
    def setup_method(self):
        # To run headless, uncomment the Options lines below and pass them to Chrome.
        # chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # self.driver = webdriver.Chrome(chrome_options=chrome_options,
        #                                executable_path='D:\project_Practice\git\git\chromedriver.exe')
        self.driver = webdriver.Chrome()
        self.vars = {}

    def teardown_method(self):
        self.driver.quit()

    def test_git2(self):
        # Fill in and submit the GitHub login form.
        self.driver.get("https://github.com/login")
        self.driver.set_window_size(1225, 1039)
        self.driver.find_element(By.ID, "login_field").click()
        self.driver.find_element(By.ID, "login_field").send_keys("2020224062")
        self.driver.find_element(By.ID, "password").click()
        self.driver.find_element(By.ID, "password").send_keys("gss813579")
        self.driver.find_element(By.NAME, "commit").click()
        # Collect the cookies of the logged-in session as a dict.
        cookies_dict = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        return cookies_dict


class LoginDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Only the git3 spider goes through Selenium; once the cookies are
        # obtained, replace the cookies on the outgoing request with them.
        if spider.name == 'git3':
            test = TestGit2()
            test.setup_method()
            cookie = test.test_git2()
            time.sleep(2)
            print(cookie)
            test.teardown_method()
            request.cookies = cookie
The browser can be run headless (see the commented-out Options in setup_method). This third method is the most universally applicable, but it is also the slowest.
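For the middleware to actually run, it has to be enabled in settings.py. A minimal sketch, assuming the project package is called git and the middleware class sits in git/middlewares.py; adjust the dotted path and priority to your own project layout:

# settings.py -- the dotted path below is an assumption; match your project layout
DOWNLOADER_MIDDLEWARES = {
    'git.middlewares.LoginDownloaderMiddleware': 543,
}

Note that in newer Selenium releases the chrome_options= and executable_path= keyword arguments shown in the commented-out code are deprecated or removed; there you would pass options=chrome_options instead and let Selenium locate the driver (or supply a Service object).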