1. Simulating a GitHub login
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy import FormRequest


class Login1Spider(scrapy.Spider):
    name = 'login1'
    allowed_domains = ['github.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://github.com/',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    start_urls = ['https://github.com/758109577']

    def start_requests(self):
        urls = ['https://github.com/login']
        for url in urls:
            yield Request(url, meta={'cookiejar': 1}, callback=self.github_login)

    def github_login(self, response):
        # First grab authenticity_token. You can inspect the page with `scrapy shell "url"`
        # (see the shell sketch after this spider) and read the token out of the page source.
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        self.logger.info('authenticity_token=' + authenticity_token)
        # The POST url can be captured with Fiddler. With dont_click=True the form data
        # is submitted directly, without simulating a click on any element.
        return FormRequest.from_response(response,
                                         url='https://github.com/session',
                                         meta={'cookiejar': response.meta['cookiejar']},
                                         headers=self.headers,
                                         formdata={'utf8': '✓',
                                                   'authenticity_token': authenticity_token,
                                                   'login': 'aaaaaa@qq.com',
                                                   'password': 'xxxxxx'},
                                         callback=self.github_after,
                                         dont_click=True,
                                         )

    def github_after(self, response):
        # Look for the string 'Browse activity' on the post-login home page.
        texts = response.xpath("//a[@class='UnderlineNav-item selected']/text()").extract()
        # If the string is present, the login succeeded.
        if 'Browse activity' in texts:
            self.logger.info('Login succeeded; keyword found: Browse activity')
            for url in self.start_urls:
                yield Request(url=url, callback=self.show)

    def show(self, response):
        print("############################")
        names = response.xpath("//span[@class='p-nickname vcard-username d-block']/text()").extract()
        if 'aaaaaa' in names:
            print(names)
            print("############################")
        else:
            print("Login failed")
2. Simulating a 51CTO login
items configuration
vim items.py
class CtoItem(scrapy.Item):
    title_url = scrapy.Field()
    title = scrapy.Field()
    fullname = scrapy.Field()
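One more piece of configuration worth noting: both spiders pass meta={'cookiejar': ...}, which only works while Scrapy's cookie middleware is enabled, and login pages are often disallowed by robots.txt. A minimal settings.py sketch; both are standard Scrapy settings, and the values here are assumptions about this project's setup:

COOKIES_ENABLED = True   # on by default; required for the 'cookiejar' meta key to keep the session
ROBOTSTXT_OBEY = False   # login endpoints are often disallowed by robots.txt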
vim login2.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest, Request
from ..items import CtoItem


class Login2Spider(scrapy.Spider):
    name = 'login2'
    allowed_domains = ['51cto.com']
    # start_urls = ['http://51cto.com/']

    def start_requests(self):
        urls = ['http://home.51cto.com/index']
        for url in urls:
            yield Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        csrf = response.xpath("//input[@name='_csrf']/@value").extract_first()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.51cto.com/',
            'Content-Type': 'text/html; charset=UTF-8'
        }
        self.logger.info("csrf value is %s" % csrf)
        yield FormRequest.from_response(response,
                                        url='https://blog.51cto.com/haoyonghui?type=1',
                                        headers=headers,
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        formdata={
                                            'LoginForm[username]': 'aaaaaaa@qq.com',
                                            'LoginForm[password]': 'xxxxxx',
                                            'LoginForm[rememberMe]': '0',
                                            '_csrf': csrf,
                                        },
                                        callback=self.after_login,
                                        dont_click=True,
                                        )

    def after_login(self, response):
        resps = response.css('ul.artical-list li')
        for resp in resps:
            # Fill in the item fields (a fresh item per list entry).
            item = CtoItem()
            item['title_url'] = resp.css("a.tit::attr(href)").extract_first()
            item['title'] = resp.css("a.tit::text").extract_first().strip()
            # fullname is formatted as "[title](link)" because that is Markdown
            # link syntax: clicking the title opens the link.
            item['fullname'] = '[' + item['title'] + ']' + '(' + item['title_url'] + ')'
            # The logger call here is also just for debugging.
            print("###################")
            self.logger.info("title_url is %s, title is %s" % (item['title_url'], item['title']))
            yield item
        # Follow the next page, if there is one.
        next_page = response.css('li.next a::attr(href)').extract_first()
        # self.logger.info("next page link: %s" % next_page)
        if next_page is not None:
            yield Request(next_page, callback=self.after_login)
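Since fullname already holds a ready-made Markdown link, the yielded items can be collected by a small item pipeline that appends each link to a .md file. A minimal sketch; the pipeline class name, the output filename, and the 'myproject' module path are hypothetical and would need to match your project when registered in settings.py:

vim pipelines.py
class MarkdownLinkPipeline(object):  # hypothetical name, for illustration only
    def open_spider(self, spider):
        # Open the output file once when the crawl starts.
        self.f = open('blog_links.md', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Append one "[title](url)" line per scraped post.
        self.f.write(item['fullname'] + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()

Then enable it in settings.py:
ITEM_PIPELINES = {'myproject.pipelines.MarkdownLinkPipeline': 300}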
Reposted from: https://blog.51cto.com/haoyonghui/2140888