Web scraper code template

A template that scrapes the medical-insurance drug catalogue (医保药品目录) from ylbz.yn.gov.cn and writes it to an .xls file.

from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import ssl
import re
import xlwt

# Regex that pulls the text out of a stringified <td>...</td> cell.
finddata = re.compile(r'<td>(.*?)</td>')


def main():
    baseurl = "http://ylbz.yn.gov.cn/index.php?s=form&c=ybypml&m=page"
    datalist = getDate(baseurl)
    print(datalist)
    savepath = "医保药品目录爬取.xls"
    saveData(datalist, savepath)


def askURL(url):
    # Spoof a browser User-Agent so the request is not rejected.
    head = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.60 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=head)
    # A bare SSLContext performs no certificate verification, which is
    # what lets the request reach this site over HTTPS.
    res = urllib.request.urlopen(req, context=ssl.SSLContext(ssl.PROTOCOL_SSLv23))
    html = res.read().decode("utf-8")
    return html


def getDate(baseurl):
    alldata = []
    # Scrape the first ten pages of the listing.
    for i in range(1, 11):
        url = baseurl + "&page=" + str(i)
        html = askURL(url)
        # Build a BeautifulSoup object to parse the document.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("td"):
            data = re.findall(finddata, str(item))
            # Keep one string per cell so each entry maps to one
            # spreadsheet cell (appending the whole list, as the first
            # draft did, gives xlwt a value it cannot write).
            alldata.append(data[0] if data else "")
    return alldata


def saveData(datalist, savepath):
    # Save the scraped cells to an Excel workbook, eight cells per row.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('医保药品', cell_overwrite_ok=True)
    col = ("药品编码", "通用名", "商品名", "类别", "规格", "包装", "剂型", "厂家")
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    rows = len(datalist) // 8  # derive the row count instead of hard-coding 250
    for j in range(0, rows):
        for r in range(0, 8):
            sheet.write(j + 1, r, datalist[j * 8 + r])
    book.save(savepath)


if __name__ == '__main__':
    main()
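One caveat with the `<td>` regex: because it runs over the stringified tag, any markup nested inside a cell ends up in the captured text. Below is a minimal sketch of a more robust alternative using BeautifulSoup's get_text(), which the template already imports; the `<tr>`/`<span>` snippet and the cell values in it are hypothetical stand-ins, not taken from the target site.

from bs4 import BeautifulSoup

# Hypothetical row standing in for one row of the target table.
html = "<tr><td>XJ01CE0001</td><td><span>青霉素</span></td></tr>"
soup = BeautifulSoup(html, "html.parser")

# re.findall(r'<td>(.*?)</td>', str(td)) would capture '<span>青霉素</span>'
# for the second cell; get_text() strips the nested markup instead.
cells = [td.get_text(strip=True) for td in soup.find_all("td")]
print(cells)  # ['XJ01CE0001', '青霉素']

get_text(strip=True) also tolerates cell text that spans several lines, which the non-greedy regex (compiled without re.S) would miss entirely.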