整体代码如下:
import requests
import lxml.html
import chardet
import pymongo
def mongodb(content_list):
    """Insert scraped records into the ``Disease_classification.disease`` collection.

    Connects with ``pymongo.MongoClient()`` using the driver's default
    connection settings and closes the client again before returning.

    Args:
        content_list: A list of record dicts to store. A single dict is also
            accepted, as the legacy ``Collection.insert`` call allowed.
    """
    if isinstance(content_list, dict):
        documents = [content_list]
    else:
        documents = list(content_list)
    if not documents:
        # insert_many() rejects an empty batch, so there is nothing to do.
        return
    # Using the client as a context manager closes its sockets on exit.
    with pymongo.MongoClient() as connection:
        collection = connection.Disease_classification.disease
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_many() is the supported bulk-insert call.
        collection.insert_many(documents)
def page_until_dict(source):
    """Parse one listing page and store its table rows via ``mongodb``.

    Every ``<tr>`` of the striped result table is read on its own: the
    ``<th>`` cell supplies the disease code, the second ``<td>`` the disease
    name and the third ``<td>`` the English name. Reading row by row keeps
    the three values of a record together even when a cell has no text;
    querying each column over the whole table and pairing results by index
    would shift later rows or raise ``IndexError`` in that case.

    Args:
        source: HTML text of a listing page.
    """
    selector = lxml.html.fromstring(source)
    rows = selector.xpath(
        '//div[@class="main"]/div[@class="offset-top table-list"]'
        '/table[@class="table table-striped"]/tbody/tr'
    )
    content_list = []
    for row in rows:
        code = row.xpath('th/text()')
        if not code:
            # A row without a code cell is not a data record; skip it.
            continue
        name = row.xpath('td[2]/text()')
        english_name = row.xpath('td[3]/text()')
        content_list.append({
            '疾病编码': code[0],
            # A cell with no text node is stored as an empty string.
            '疾病名称': name[0] if name else '',
            '疾病名称(英文)': english_name[0] if english_name else '',
        })
    if content_list:
        # Only hit the database when the page actually produced records.
        mongodb(content_list)
def main():
    """Download listing pages 1 through 7 and hand each one to ``page_until_dict``.

    Each page is requested with a browser-like ``User-Agent`` header, decoded
    with the encoding that ``chardet`` detects from the raw bytes, and then
    passed on as text for parsing and storage.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
    url_list = ["https://db.yaozh.com/icd?p={}&pageSize=30".format(i) for i in range(1, 8)]
    for url in url_list:
        # requests never times out by default; bound the wait so that a
        # stalled server cannot hang the whole run.
        r = requests.get(url, headers=headers, timeout=30)
        # Decode using the encoding sniffed from the body bytes.
        r.encoding = chardet.detect(r.content)['encoding']
        page_until_dict(r.text)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
最终存入数据库如下: