最近吃了一些去湿热的中药,老医生写的药名潦草凌乱,一个药名两个字可能只能猜出来半个,所以就想着把所有中药名检索下来,去里面匹配,把药方读明白。
药名来自:A+医学百科(www.a-hospital.com)的“中药图典”,收录约2800味中药及其别名。
首先需要分析网页工作流,确定方案
中药图典的2800味药分布在28页上,每页100味。可以分别获取28个页面,之后在本地用 beautifulsoup 解析。
爬取28个网页,保存文本到本地
import json
import requests
import time
import random
from bs4 import BeautifulSoup
import tqdm
def _page_url(root_url, page):
    """Return the URL of listing page *page*; page 1 is the bare root URL."""
    return root_url if page == 1 else f"{root_url}{page}"


def crawl_pages(first_page=1, last_page=28, timeout=10):
    """Download the listing pages and save each one as ``page{N}.html``.

    Each page is fetched with browser-like headers after a short random
    pause, and written to the current working directory as UTF-8 text.
    Pages that fail (network error or non-200 status) are reported on
    stdout and skipped, so the remaining pages are still attempted.

    Args:
        first_page: Number of the first page to fetch (inclusive). Defaults
            to 1 so that every file later read by the parser is produced;
            pass a higher number to resume an interrupted crawl.
        last_page: Number of the last page to fetch (inclusive).
        timeout: Seconds to wait for the server before giving up on a page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Host": "www.a-hospital.com",
        "Proxy-Connection": "keep-alive",
        "Referer": "https://cn.bing.com/",
        "Upgrade-Insecure-Requests": "1",
    }
    # Root of the listing; page N (N > 1) lives at root_url + "N".
    root_url = "http://www.a-hospital.com/w/%E4%B8%AD%E8%8D%AF%E5%9B%BE%E5%85%B8/"

    for page in range(first_page, last_page + 1):
        # Pause for a random 0-3 seconds so the requests are not fired
        # back-to-back and flagged as a crawler.
        time.sleep(random.randint(0, 3))

        # Pretend we navigated here from the previous listing page. Page 1
        # keeps the initial search-engine Referer.
        if page > 1:
            headers["Referer"] = _page_url(root_url, page - 1)
        url = _page_url(root_url, page)

        try:
            resp = requests.get(url, headers=headers, verify=True, timeout=timeout)
        except requests.RequestException as exc:
            print(f"page {page} request failed: {exc}")
            continue

        if resp.status_code == 200:
            with open(f"page{page}.html", "w", encoding="utf-8") as file:
                file.write(resp.text)
            # Report success only after the file has actually been written.
            print(f"page {page} saved!")
        else:
            print(f"page {page} code {resp.status_code}!")
解析保存
解析页面,取出药名与别称,保存到字典,并另存到本地 json 文件;注意传入 ensure_ascii=False,这样中文会原样写入文件,而不是被转义成 \uXXXX 形式。
import json
import requests
import time
import random
from bs4 import BeautifulSoup
import tqdm
def _split_aliases(raw):
    """Split a "、"-separated alias string into a list of clean names.

    The alias text may be wrapped in one pair of brackets (ASCII or
    full-width); that pair is removed so it does not stick to the first
    and last alias. Empty pieces are dropped.
    """
    text = raw.strip()
    if text.startswith(("(", "(")) and text.endswith((")", ")")):
        text = text[1:-1]
    return [name.strip() for name in text.split("、") if name.strip()]


def parse_medicine():
    """Parse the saved listing pages into a ``{medicine: [aliases]}`` mapping.

    Reads ``page1.html`` .. ``page28.html`` from the current working
    directory, extracts each medicine name and its aliases, writes the
    mapping to ``medicine_alia.json`` (UTF-8, Chinese kept unescaped) and
    returns it.

    Returns:
        dict mapping each medicine name to a list of alias strings (empty
        when the cell has no alias element).

    Raises:
        FileNotFoundError: if one of the page files is missing.
    """
    cell_selector = "div#masonry-container > table.wikitable > tr:last-child > td"
    medicine_alia = {}

    for p in range(1, 29):
        with open(f"page{p}.html", "r", encoding="utf-8") as file:
            # Pin the stdlib parser: the generic "html" feature picks
            # whichever parser is installed, and html5lib would insert
            # <tbody>, breaking the "table > tr" child selector above.
            soup = BeautifulSoup(file.read(), "html.parser")

        for cell in soup.select(cell_selector):
            name_links = cell.select("td > b > a")
            if not name_links:
                # Cell without a medicine link (e.g. padding); nothing to record.
                continue
            alias_spans = cell.select("td > span")
            medicine_alia[name_links[0].text] = (
                _split_aliases(alias_spans[0].text) if alias_spans else []
            )

        print(f"page {p} read! Current medicines: {len(medicine_alia)}")

    # ensure_ascii=False keeps Chinese characters readable in the file.
    json_str = json.dumps(medicine_alia, indent=4, ensure_ascii=False)
    with open("medicine_alia.json", "w", encoding="utf-8") as json_file:
        json_file.write(json_str)
    return medicine_alia
部分结果
{
"八角枫": [
"(白金条[侧根名]",
"白龙须[须状根名]",
"八角王",
"八角梧桐",
"八角将军",
"割舌罗",
"五角枫",
"七角枫",
"野罗桐",
"花冠木)"
],
"八角茴香": [
"(大茴香",
"大料",
"五香八角",
"舶上茴香",
"舶茴香",
"茴香八角珠",
"八角香",
"八角大茴",
"八角",
"原油茴",
"大八角",
"八角珠",
"八月珠)"
],
"八角金盘": [
"(手树",
"金刚纂)"
],
...
"紫珠": [
"(紫珠草",
"止血草",
"紫荆",
"粗糠仔",
"鸦鹊板",
"螃蟹目",
"雅目草",
"白毛柴",
"白奶雪草)"
],
"紫珠叶": [
"(大风叶",
"白狗肠",
"大叶紫珠)"
],
"自然铜": [
"(石髓铅",
"方块铜)"
]
}