1.数据来源是全国公共资源交易网(http://deal.ggzy.gov.cn/ds/deal/dealList.jsp?HEADER_DEAL_TYPE=01)
2.获取(地区、标题、来源、开标记录、类型、中标结果等信息)抓取代码如下:
# Parse one page of the listing API.  The endpoint answers with a JSON
# document whose "data" array holds bid announcements; for each of the
# first 10 entries a BidItem is filled and the detail page is followed.
def parse(self, response):
    item = BidItem()
    res = json.loads(response.text)
    # Only the first 10 announcements of this page are processed.
    for entry in res["data"][:10]:
        # Title of the announcement.
        item["title"] = entry["title"]
        # Province / district shown on the site.
        item["province"] = entry["districtShow"]
        # NOTE(review): city is hard-coded -- presumably this spider is
        # configured for a single region; confirm before reusing it.
        item["city"] = "承德市"
        # Publication date.
        item["zb_date"] = entry["timeShow"]
        # No abstract is available from the listing API.
        item["abstract"] = "暂无摘要"
        # The "/a/" detail page has a sibling "/b/" page carrying the
        # tender document itself.
        detail_url = entry["url"]
        bid_url = detail_url.replace("/a/", "/b/")
        try:
            yield scrapy.Request(
                url=detail_url,
                # deepcopy so the item being refilled on each loop pass
                # is not shared between the scheduled requests.
                meta={"key_item": deepcopy(item), "bidurl": bid_url},
                callback=self.parse_detail,
                dont_filter=True,
            )
        except Exception as e:
            logger.error(e)
# Parse the announcement detail page: collect the bid-opening record,
# the trade result and the clarification documents exposed as page
# sub-sections, then follow the tender-document ("/b/") URL.
def parse_detail(self, response):
    item = response.meta.get("key_item")
    # Tender-document URL prepared by parse().
    bid_url = response.meta.get("bidurl")
    # URLs (or placeholder markers) handed on to bid_parse.
    url_list = [bid_url]

    def load_section(div_id, field, placeholder):
        """Store the text of one detail sub-section into item[field].

        Returns the absolute section URL, or None when the page marks
        the section as empty ("暂无") -- in that case the placeholder
        string is appended to url_list instead.
        """
        # NOTE(review): extract()[0] raises IndexError when the div is
        # missing; the original code behaved the same -- confirm the
        # page always contains these sections.
        first_text = response.xpath(
            "//div[@id='%s']//li//text()" % div_id
        ).extract()[0]
        if "暂无" in first_text:
            item[field] = "暂无数据"
            url_list.append(placeholder)
            return None
        onclick = response.xpath(
            "//div[@id='%s']//a/@onclick" % div_id
        ).extract()[0]
        # The onclick handler embeds the relative URL as its second
        # quoted argument; [:-2] strips the trailing quote + paren.
        section_url = (
            "http://www.ggzy.gov.cn/information"
            + onclick.split("','")[1][:-2]
        )
        # NOTE(review): requests.get() blocks the Twisted reactor; a
        # scrapy.Request would be the non-blocking way to fetch this.
        item[field] = requests.get(section_url).text
        return section_url

    # Bid-opening record (开标记录).
    load_section("div_0102", "bid_opening_record", "2")
    # Trade result (交易结果) -- only this section's URL is kept in the
    # list, mirroring the original code's asymmetry.
    result_url = load_section("div_0104", "trade_result", "3")
    if result_url is not None:
        url_list.append(result_url)
    # Tender / prequalification clarification (招标/资审文件澄清).
    load_section("div_0105", "bid_clear", "4")

    try:
        yield scrapy.Request(
            url=bid_url,
            meta={"key_item": deepcopy(item), "list1": url_list},
            callback=self.bid_parse,
            dont_filter=True,
        )
    except Exception as e:
        logger.error(e)
3.获取到的数据如下:
4.通过这些数据,利用django做一个自己的查询平台。
5.如有问题,请加qq:763073105