import requests, re, datetime from bs4 import BeautifulSoup #解析网页数据 import xlwt # 写入excle import random # 初始化 def init(): global url, province_name, headers # 爬取文件地址 url = 'https://www.yixue.com/' # [] list 容器 # 存储各省名称 province_name = [ '北京市', '天津市', '河北省', '山西省', '辽宁省', '吉林省', '黑龙江省', '上海市', '江苏省', '浙江省', '安徽省', '福建省', '江西省', '山东省', '河南省', '湖北省', '湖南省', '广东省', '内蒙古自治区', '广西壮族自治区', '海南省', '重庆市', '四川省', '贵州省', '云南省', '西藏自治区', '陕西省', '甘肃省', '青海省', '宁夏回族自治区', '新疆维吾尔自治区' ] headers = { # 浏览器信息 # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", "user-agent": random_user_agent(), # 从哪里来 "Referer": "https://www.yixue.com/", # 用户信息 "cookie": "newstatisticUUID=1651282597_602532090; _csrfToken=rmG7zCbbkD7QK34BymosH69Xve6eibDLGzeI2q8q; pageOps=1; fu=1403875312; qdrs=0|3|0|0|1; showSectionCommentGuide=1; qdgd=1; lrbc=1033272309|702228691|0; rcr=1033272309; bc=1033272309; _gid=GA1.2.364214729.1651282600; readadclose=1; _gat_gtag_UA_199934072_2=1; _ga_FZMMH98S83=GS1.1.1651282598.1.1.1651282753.0; _ga_PFYW0QLV3P=GS1.1.1651282598.1.1.1651282753.0; _ga=GA1.2.279552178.1651282599" } #随机取数 def random_user_agent(): ulist =["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36","Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32"] # 随机从列表中取值 #print(random.choice(ulist)) #return random.choice(ulist) #0,列表长度中取值 return ulist[random.randint(0,len(ulist)-1)] #存储数据 def sav_message(messgae, province_now): # 创建一个excel表格用来写入信息 workbook = xlwt.Workbook(encoding='utf-8') name = province_now + "医院列表" # 添加工作表 table = workbook.add_sheet(name) value = ["医院名称", "医院地址", "联系电话", "医院等级", "重点科室", "经营方式", "传真号码", "电子邮箱", "医院网站"] # 添加表头信息 for i in range(len(value)): # print(value[i]) # 行,列,值 写入 table.write(0, i, value[i]) i = 0 # for data in messgae[3]: 获取医院信息位置 for data in messgae[4]: # print(data) if len(data) <= 1: continue else: try: # 写入医院名称 # print(data.b.a.text) hospltal = data.b.a.text table.write(i + 1, 0, data.b.a.text) for data_1 in data.ul: # print(data.ul) # print(data_1) now_mess = data_1.text.replace('\n', '') now_data = now_mess.split(':') # print(now_data) count = 0 for tittle in value: if tittle == now_data[0]: index = count #print(now_data[1]) table.write(i + 1, index, now_data[1]) count = count + 1 i = i + 1 #print(i) except: print(data.b.text + "这家医院爬取失败") # 保存一个excle数据表格 workbook.save('医院信息爬虫数据/分省/' + province_now + '医院列表.xls') # 获取当前省数据 def get_province_hospital(province_now): # 访问链接 r = requests.get(url + province_now + '医院列表', headers=headers, timeout=10) # print(r.text) # 数据分析 # 加载 设置解析器 soup = BeautifulSoup(r.text, "lxml") # 获取页面内容 message = soup.find_all('ul') # print(ul_content) sav_message(message, province_now) #主函数 if __name__ == '__main__': init() # 循环访问链接获取信息 for province in province_name: # print(province) get_province_hospital(province)
一键爬取全国所有医院数据,可以使用,仅供学习
最新推荐文章于 2024-07-03 20:47:12 发布