import time
import traceback

import requests
from bs4 import BeautifulSoup, Tag

# from write_excel import OpenpyxlExcel
from write_excel import OprationExcel


class RequestInfo(object):
    """Fetch the detail pages under ``url_root`` and collect their key/value rows.

    URLs are built by appending a zero-padded, 8-digit id to ``url_root``.
    Parsed data accumulates on the instance across calls:

    * ``info_key_list`` / ``info_values_list`` -- parallel lists of field
      names and field values, in the order they were found.
    * ``info_list`` -- one single-item ``{key: value}`` dict per parsed field.
    * ``url_list`` -- every URL produced by :meth:`generate_url`.
    """

    def __init__(self):
        self.url_root = "http://xkz.cbirc.gov.cn/ilicence/showLicenceInfo.do?state=3&id="
        # Field name (key) of every parsed entry.
        self.info_key_list = []
        # Field value belonging to each key.
        self.info_values_list = []
        # Every parsed entry stored as a dict (JSON-like layout).
        self.info_list = []
        # Every URL that has been generated.
        self.url_list = []
        # All serial numbers.
        self.lsh_values_list = []
        # All institution names.
        self.jgname_values_list = []
        # All institution codes.
        self.jgcode_values_list = []

    def generate_url(self, start, end):
        """Yield the page URL for every id in ``start..end`` (inclusive).

        Each id is left-padded with zeros to 8 characters and appended to
        ``url_root``. Every yielded URL is also recorded in ``url_list``.
        """
        for i in range(start, end + 1):
            url = self.url_root + "{:0>8}".format(i)
            self.url_list.append(url)
            yield url

    @staticmethod
    def _node_text(node):
        """Return the text of a parse-tree node, or "" when it has none."""
        text = node.string
        # ``Tag.string`` is None for a tag with no children or with several
        # children; fall back to the concatenated text instead of crashing.
        if text is None and isinstance(node, Tag):
            text = node.get_text()
        return text or ""

    @classmethod
    def _extract_keys(cls, label_cell):
        """Return the field names found among the children of a label cell."""
        keys = []
        for child in label_cell:
            words = cls._node_text(child).split()
            if words:
                # Keep the first word and drop the trailing ":" of the label.
                keys.append(words[0].split(":")[0])
        return keys

    @classmethod
    def _extract_values(cls, label_cell):
        """Return the field values found in the siblings after a label cell."""
        values = []
        for sibling in label_cell.next_siblings:
            words = cls._node_text(sibling).split()
            if words:
                values.append(words[0])
            elif isinstance(sibling, Tag):
                # An empty cell still counts as a value; mark it with "--".
                values.append("--")
            # Whitespace-only text between the tags is ignored.
        return values

    def parse_html(self, start, end, timeout=10):
        """Download and parse the pages for ids ``start..end`` (inclusive).

        For every page whose title is not the error title, each ``<tr>`` with
        CSS class ``a0`` that contains a right-aligned ``<td>`` is scanned:
        field names come from that cell, field values from the cells after
        it. Results are appended to ``info_key_list``, ``info_values_list``
        and ``info_list``; the raw HTML and the parsed tree of the most
        recent page are kept in ``self.html`` and ``self.soup``.

        Args:
            start: First id to fetch.
            end: Last id to fetch (inclusive).
            timeout: Seconds to wait for each HTTP request.

        Yields:
            ``(info_key, info_value)`` -- the last field name and the last
            field value found on each page that produced at least one of
            each.

        Any exception is printed with ``traceback.print_exc()`` and ends the
        iteration early instead of propagating to the caller.
        """
        try:
            for url in self.generate_url(start, end):
                response = requests.get(url, timeout=timeout)
                self.html = response.text
                self.soup = BeautifulSoup(self.html, "html.parser")

                title = self.soup.find("title")
                if title is not None and title.string == "出错啦!":
                    print("未找到内容")
                    continue

                page_keys = []
                page_values = []
                for tr in self.soup.find_all("tr", "a0"):
                    label_cell = tr.find("td", {"align": "right"})
                    if label_cell is None:
                        continue
                    page_keys.extend(self._extract_keys(label_cell))
                    page_values.extend(self._extract_values(label_cell))

                self.info_key_list.extend(page_keys)
                self.info_values_list.extend(page_values)
                # Pair only this page's fields so earlier pages are not
                # appended again; zip() stops at the shorter list when the
                # key and value counts differ.
                self.info_list.extend(
                    {key: value} for key, value in zip(page_keys, page_values)
                )

                if page_keys and page_values:
                    yield page_keys[-1], page_values[-1]
        except Exception:
            traceback.print_exc()


def main():
    """Print the URLs generated for ids 1 through 10."""
    try:
        scraper = RequestInfo()
        for url in scraper.generate_url(1, 10):
            print(url)
    except Exception:
        traceback.print_exc()
    # NOTE: exporting the parsed data to Excel through the write_excel
    # helpers is not wired up yet.


if __name__ == "__main__":
    main()