class ChongqingSpider(scrapy.Spider):
    """Spider for the paged list at ``start_url`` on www.cqjsxx.com.

    Both paging and opening a list entry are done by posting a form back to
    the list page with ``__EVENTTARGET`` set to the control that was
    "clicked" and ``__VIEWSTATE`` set to the hidden field of the page that
    rendered that control.
    """

    name = 'chongqing'
    start_url = 'http://www.cqjsxx.com/webcqjg/GcxxFolder/jgysba_list.aspx'

    # Optional proxy middleware, currently disabled:
    # custom_settings = {
    #     'DOWNLOADER_MIDDLEWARES': {
    #         'zb_yitihua.middlewares.ProxyMiddleware': 543,
    #     }
    # }

    # Template for the "next page" postback. It is treated as read-only:
    # every request gets its own copy so pages never share view state.
    post_data = {
        '__EVENTTARGET': 'Pager1:LB_Next',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': '',
        'SearchName': '',
        'SearchNo': '',
    }

    def _postback_form(self, event_target, viewstate):
        """Return a fresh form dict for one postback.

        :param event_target: value for ``__EVENTTARGET``.
        :param viewstate: ``__VIEWSTATE`` taken from the page being posted
            back to.
        """
        form = dict(self.post_data)
        form['__EVENTTARGET'] = event_target
        form['__VIEWSTATE'] = viewstate
        return form

    def start_requests(self):
        """Request the first list page; the response goes to ``parse``."""
        yield scrapy.Request(url=self.start_url)

    def parse(self, response):
        """Handle one list page.

        Yields one ``FormRequest`` per list row (handled by
        ``content_parse``) and, while the pager reports more pages, one
        ``FormRequest`` for the next page (handled by ``parse`` again).
        """
        try:
            total = response.xpath('//*[@id="Pager1_RCount"]/text()').extract()[0].strip()
            total_page = int(response.xpath('//*[@id="Pager1_Pages"]/text()').extract()[0].strip())
            now_page = int(response.xpath('//*[@id="Pager1_CPage"]/text()').extract()[0].strip())
        except (IndexError, ValueError):
            # Pager elements are absent or not numeric: this is not a list
            # page we know how to read.
            self.logger.warning('Pager info missing or malformed on %s', response.url)
            return

        # The view state must come from THIS response: the row and pager
        # controls named in __EVENTTARGET only exist in this page's state.
        viewstate = response.xpath('//*[@name="__VIEWSTATE"]/@value').extract_first(default='').strip()
        if not viewstate:
            self.logger.warning('No __VIEWSTATE found on %s; cannot post back', response.url)
            return

        if now_page == 1:
            print('============= {0} 总共有 {1} 条数据 ==================='.format('重庆市工程信息网', str(total)))

        # Parse the entries on the current page (the first row is skipped).
        for row in response.xpath('//table[@id="dgFileNotice"]//tr')[1:]:
            href = row.xpath('.//td[1]/font/a/@href').extract_first(default='')
            match = re.search(r"javascript:__doPostBack\('(.*?)',''\)", href, re.S)
            if match is None:
                # Row without a postback link: skip it instead of aborting
                # the whole page.
                continue
            # The link uses '$' as separator; the posted target uses ':'.
            target = match.group(1).replace('$', ':')
            yield scrapy.FormRequest(
                url=self.start_url,
                formdata=self._postback_form(target, viewstate),
                dont_filter=True,
                callback=self.content_parse,
            )

        if now_page < total_page:
            # Prepare the postback for the next page.
            yield scrapy.FormRequest(
                url=response.url,
                formdata=self._postback_form(self.post_data['__EVENTTARGET'], viewstate),
                dont_filter=True,
                callback=self.parse,
            )
        else:
            print('全部采集完成++++++++++++')

    def content_parse(self, response):
        """Extract the ``CheckID`` from a postback response and build the
        detail-view URL.
        """
        match = re.search(r"\('Jgysba_List_View.aspx\?CheckID=(.*?)'", response.text, re.S)
        if match is None:
            self.logger.warning('No CheckID found in response from %s', response.url)
            return
        url = 'http://www.cqjsxx.com/webcqjg/GcxxFolder/Jgysba_List_View.aspx?CheckID=' + match.group(1)
        # NOTE(review): the visible source builds this URL and then stops;
        # presumably a request for it should be yielded here - confirm.
        self.logger.debug('Detail URL: %s', url)
javascript:__doPostBack('dgFileNotice$_ctl11$lbtnCheck','')
最新推荐文章于 2023-07-06 09:55:20 发布