# -*-coding:utf-8-*-
"""Scrape CCB (China Construction Bank) credit-card merchant-discount data
for every province/city listed in citys.js and append rows to an .xls file.
"""
import json
import os
import sys
import requests
import xlrd
import xlwt
from xlutils.copy import copy

# Column headers — MUST stay in sync with the 18-element rows built in main().
# (The original list had two fused entries, 'province,city_id' and
# 'city,biz_addr', leaving 16 headers for 18 data columns.)
HEADER = ['biz_id', 'biz_name', 'cate_id', 'cate_name', 'catechild_id',
          'catechild_name', 'province_id', 'province', 'city_id', 'city',
          'biz_addr', 'biz_desc', 'start_level', 'life_id', 'life',
          'biz_phone', 'biz_cmsg', 'url']


def get_page(url):
    """GET *url* and return the Response on HTTP 200, otherwise None.

    Only request-level errors are swallowed (was a bare ``except:`` that
    hid every exception, including KeyboardInterrupt).
    """
    try:
        response = requests.get(url)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response
    return None


def write_data(sheet, row, lst):
    """Write each row-list in *lst* into *sheet*, starting at row index *row*."""
    for offset, row_values in enumerate(lst):
        for col, value in enumerate(row_values):
            sheet.write(row + offset, col, value)


def save(file_name, data):
    """Append *data* (a list of row lists) to the .xls file *file_name*.

    Creates the workbook with a header row when it does not exist yet.
    """
    if os.path.exists(file_name):
        # Open the existing workbook; formatting_info is needed so the copy
        # keeps the original cell styles.
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already present — new data is appended below them.
        rn = rb.sheets()[0].nrows
        # xlrd workbooks are read-only; copy into a writable xlwt workbook.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        write_data(sheet, rn, data)
        # wb.save() overwrites in place — do NOT os.remove() first, or a
        # failed save would lose everything collected so far.
        wb.save(file_name)
    else:
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('建设银行卡-优惠商户活动数据')
        # Header row first, data from row 1 downwards.
        for col, title in enumerate(HEADER):
            sheet.write(0, col, title)
        write_data(sheet, 1, data)
        book.save(file_name)


def main():
    """Fetch the province/city list, then scrape merchants city by city."""
    cities_js = 'http://creditcard.ccb.com/cn/creditcard/v3/js/citys.js'
    citis_resp = get_page(cities_js)
    if citis_resp is None:
        # Without the city list nothing can be scraped; bail out cleanly
        # instead of crashing on None.content.
        print('无法获取城市列表,退出')
        return
    # citys.js is "var xxx = [...]"; keep only the JSON array after '='.
    citis_json = json.loads(citis_resp.content.decode('utf-8').split('=')[1])

    path = (input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入则保存到当前地址:\n'))
    file_name = path + '建设银行卡-优惠商户活动数据.xls'
    print(file_name)

    province_total = len(citis_json)
    for district in citis_json:
        prov_code = district['prov_code']
        prov_name = district['prov_name']
        cities = district['citys']
        city_total = len(cities)
        for city in cities:
            all_company_info_list = []
            city_code = city['city_code']
            city_name = city['city_name']
            # Fixed wording: province_total counts provinces, not cities.
            print('总共有【{province_total}】个省份,现在正在爬取【{prov_name}】,该省份一共有【{city_total}】个城市,现在正在爬取【{city}】的数据'.format(
                province_total=province_total, prov_name=prov_name,
                city_total=city_total, city=city_name))
            url = ('http://creditcard.ccb.com/webtran/get_crd_info.gsp?table_type=2'
                   '&card_province={prov_code}&card_city={city_code}'
                   '&startNum=1&endNum=1000000').format(
                       prov_code=prov_code, city_code=city_code)
            listing = get_page(url)
            if listing is None:
                # Network failure for this city — skip it, keep going.
                continue
            try:
                companies_json = json.loads(listing.content.decode('utf-8'))
            except ValueError:
                # Response was not valid JSON (was a bare except:).
                continue
            if not companies_json:
                continue
            companies = companies_json.get('obj') or []
            for company in companies:
                biz_id = company.get('biz_id')
                biz_name = company.get('biz_name')
                cate_id = company.get('cate_id')
                cate_name = company.get('cate_name')
                catechild_id = company.get('catechild_id')
                catechild_name = company.get('catechild_name')
                province_id = company.get('province_id')
                province = company.get('province')
                city_id = company.get('city_id')
                # Do NOT reuse the name `city` here — it is the outer loop's
                # city dict; rebinding it was shadowing a live variable.
                company_city = company.get('city')
                biz_addr = company.get('biz_addr')
                biz_desc = company.get('biz_desc')
                start_level = company.get('start_level')
                life_id = company.get('life_id')
                life = company.get('life')
                biz_phone = company.get('biz_phone')
                biz_cmsg = company.get('biz_cmsg')
                # str() guards against biz_id being None (would raise
                # TypeError on concatenation).
                detail_url = ('http://creditcard.ccb.com/cn/creditcard/favorable/'
                              + str(biz_id) + '.html')
                all_company_info_list.append(
                    [biz_id, biz_name, cate_id, cate_name, catechild_id,
                     catechild_name, province_id, province, city_id,
                     company_city, biz_addr, biz_desc, start_level, life_id,
                     life, biz_phone, biz_cmsg, detail_url])
            save(file_name, all_company_info_list)
    print('爬完')


if __name__ == '__main__':
    main()
# TODO: 待搞明白 银行数据爬取 (bank data scraping — still to be fully understood)
# Blog metadata (non-code): 最新推荐文章于 2023-12-19 13:03:25 发布