javascript:__doPostBack('dgFileNotice$_ctl11$lbtnCheck','')

class ChongqingSpider(scrapy.Spider):
    """Crawl the paginated record list served at ``start_url``.

    The list page is an ASP.NET WebForms page: both "next page" and "open
    record" are triggered by POSTing the form back to the same URL, with
    ``__EVENTTARGET`` naming the clicked control and ``__VIEWSTATE`` echoing
    the hidden field of the page the click was made on.
    """

    name = 'chongqing'
    start_url = 'http://www.cqjsxx.com/webcqjg/GcxxFolder/jgysba_list.aspx'

    # Proxy middleware is currently disabled; uncomment to enable it.
    # custom_settings = {
    #     'DOWNLOADER_MIDDLEWARES': {
    #         'zb_yitihua.middlewares.ProxyMiddleware': 543,
    #     }
    # }

    # Form template for the "next page" postback. It is treated as read-only:
    # every request is built from a copy with the current __VIEWSTATE filled
    # in, so responses handled later never see another page's state.
    post_data = {'__EVENTTARGET': 'Pager1:LB_Next',
                 '__EVENTARGUMENT': '',
                 '__VIEWSTATE': '',
                 'SearchName': '',
                 'SearchNo': ''
                 }

    def start_requests(self):
        """Yield the initial GET for the first list page."""
        yield scrapy.Request(url=self.start_url)

    def parse(self, response):
        """Handle one list page.

        Yields one postback ``FormRequest`` per record row (handled by
        ``content_parse``) and, while the pager reports more pages, one
        postback ``FormRequest`` for the next page (handled by ``parse``).
        Pages whose pager fields are missing or non-numeric are skipped
        with a warning.
        """
        try:
            total = response.xpath('//*[@id="Pager1_RCount"]/text()').extract()[0].strip()
            total_page = int(response.xpath('//*[@id="Pager1_Pages"]/text()').extract()[0].strip())
            now_page = int(response.xpath('//*[@id="Pager1_CPage"]/text()').extract()[0].strip())
        except (IndexError, ValueError):
            self.logger.warning('Pager fields missing or not numeric on %s; page skipped',
                                response.url)
            return

        if now_page == 1:
            self.logger.info(
                '============= {0} 总共有 {1} 条数据 ==================='.format('重庆市工程信息网', str(total)))

        # The hidden __VIEWSTATE of THIS response must accompany every
        # postback that refers to a control on this page, so read it once
        # up front instead of reusing whatever an earlier page stored.
        viewstate = response.xpath('//*[@name="__VIEWSTATE"]/@value').extract_first(default='').strip()

        # Parse the rows of the current page; the first <tr> is skipped.
        rows = response.xpath('//table[@id="dgFileNotice"]//tr')
        for row in rows[1:]:
            href = row.xpath('.//td[1]/font/a/@href').extract_first(default='').strip()
            match = re.search(r"javascript:__doPostBack\('(.*?)',''\)", href, re.S)
            if match is None:
                # Row without a postback link: nothing to request.
                continue
            # The href uses '$' as separator; the posted control name uses ':'.
            event_target = match.group(1).replace('$', ':')
            detail_form = {'__EVENTTARGET': event_target,
                           '__EVENTARGUMENT': '',
                           '__VIEWSTATE': viewstate,
                           'SearchName': '',
                           'SearchNo': ''
                           }
            yield scrapy.FormRequest(url=self.start_url, dont_filter=True,
                                     callback=self.content_parse, formdata=detail_form)

        if now_page < total_page:
            # Prepare the form data for the next page from a copy of the template.
            next_form = dict(self.post_data)
            next_form['__VIEWSTATE'] = viewstate
            yield scrapy.FormRequest(url=response.url, formdata=next_form,
                                     dont_filter=True, callback=self.parse)
        else:
            self.logger.info('全部采集完成++++++++++++')

    def content_parse(self, response):
        """Build the detail-view URL from the CheckID found in a postback response.

        NOTE(review): only the URL construction is visible here; the method
        looks truncated, so ``url`` is computed but not used yet. Confirm
        what should be requested or yielded with it.
        """
        match = re.search(r"\('Jgysba_List_View.aspx\?CheckID=(.*?)'", response.text, re.S)
        if match is None:
            self.logger.warning('No CheckID found in response from %s', response.url)
            return
        check_id = match.group(1)
        url = 'http://www.cqjsxx.com/webcqjg/GcxxFolder/Jgysba_List_View.aspx?CheckID=' + str(check_id)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值