1. 简述
1.1 整体目标
- 目标网站:http://www.pesrmyy.com/huamdisease/List#0|01
- 爬取主要部位-细节部位-症状(信息)-相关疾病(信息)
- 相关数据保存在MySQL中。
1.2 主要内容
- beautifulsoup
- 无界面浏览器,爬取动态内容
- 连接数据库
2. 代码
2.1 网页相关
# -*- coding:utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerTest1

# Entry script: enumerate the "general area -> detail area" tree on the
# hospital site's left-hand menu and crawl each detail-area listing page
# via puerTest1.page().
url_ori = "http://www.pesrmyy.com/HuamDisease/List#"

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
try:
    browser.get(url_ori)
    time.sleep(2)  # menu is rendered by JS; wait so page_source is complete
    source = browser.page_source
finally:
    browser.quit()  # always release the browser process, even on failure

soup = BeautifulSoup(source, "html5lib")
# BUG FIX: the original passed attrs={"class", "block_left"} — a set, which
# BeautifulSoup treats as a loose class filter. A dict is the intended form.
div = soup.find('div', attrs={"class": "block_left"})
ul = div.find('ul')
for li in ul.find_all('li'):
    # Only top-level <li> entries that contain a nested <ul> are general
    # areas; the nested <li> children are their detail areas.
    if li.ul is not None:
        generalArea = li.a.next_element  # text before the nested markup
        data_id = li['data-id']
        for li_son in li.ul.find_all('li'):
            detailArea = li_son.string
            data_id2 = li_son['data-id']
            # Listing URLs follow the "#<general-id>|<detail-id>" scheme.
            url = url_ori + data_id + "|" + data_id2
            print(generalArea, detailArea, url)
            puerTest1.page(generalArea, detailArea, url)
# -*- coding:utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerMysql0
import puerTest2
import MySQLdb
def page(generalArea, detailArea, url):
    """Crawl one detail-area listing page.

    For every symptom listed on the page, record the
    (generalArea, detailArea, symptom) mapping in MySQL and, when the
    symptom's detail page has not been crawled yet, hand its URL to
    puerTest2.sonpage().

    :param generalArea: general body-area name (e.g. a top menu entry)
    :param detailArea: detail body-area name under that general area
    :param url: listing-page URL for this detail area
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        # The listing is loaded dynamically; a generous delay gives the
        # complete page source.
        time.sleep(5)
        source = browser.page_source
    finally:
        browser.quit()  # release the browser even if the page load fails

    soup = BeautifulSoup(source, "html5lib")
    url_main = 'http://www.pesrmyy.com'
    # BUG FIX: dict attribute filter instead of the original set literal
    # attrs={"class", "right_content"}.
    div = soup.find('div', attrs={"class": "right_content"})

    # One DB connection for the whole page instead of reconnecting for
    # every symptom, as the original did inside the loop.
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        for li in div.ul.find_all('li'):
            zhengzhuang = li.a.h1.string  # symptom name
            href = li.a['href']
            # This step must always run so the area/symptom mapping exists.
            puerMysql0.addToSql(generalArea, detailArea, zhengzhuang)
            cur = conn.cursor()
            try:
                # Parameterized query — the original interpolated the name
                # into the SQL string, which breaks on quotes and is
                # injection-prone.
                exist = cur.execute(
                    "SELECT symptom from symptom WHERE symptom=%s",
                    (zhengzhuang,))
            finally:
                cur.close()
            if exist == 1:
                # This symptom's detail page was already crawled; move on.
                continue
            url_son = url_main + href
            print("url_son:" + url_son)
            puerTest2.sonpage(url_son)
    finally:
        conn.close()
# -*- coding:utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerTest3
import puerMysql0
import puerMysql1
def sonpage(url):
    """Crawl one symptom detail page.

    Extracts the symptom name, description, cause, diagnosis and related
    diseases, crawls each related disease via puerTest3.sonpageJibing(),
    then stores the symptom row via puerMysql1.addToSql().

    :param url: absolute URL of the symptom page
    """
    url_main = "http://www.pesrmyy.com"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        time.sleep(2)  # dynamic page; wait for JS rendering
        source = browser.page_source
    finally:
        browser.quit()  # release the browser even on failure

    soup = BeautifulSoup(source, "html5lib")
    # The symptom name sits at block_right_head > .title > div.
    div_right_head = soup.find('div', class_="block_right_head")
    div_title = div_right_head.find('div', class_="title")
    zhengZhuang = div_title.div.string

    # Initialize every field up front: any section may be missing from the
    # page, and the DB insert expects all of them.
    jibing_names = []   # related disease names
    jibing_links = []   # matching related disease page URLs
    miaoShu = ""        # description
    qiYin = ""          # cause
    zhenDuan = ""       # diagnosis

    # BUG FIX: dict attribute filter instead of the original set literal.
    div_son = soup.find('div', attrs={"class": "right_content"})
    for li_son in div_son.ul.find_all('li'):
        if li_son.h1 is None:
            # Heading-less section: the symptom description. The markup is
            # irregular, so keep the last paragraph with actual text.
            for p in li_son.find_all('p'):
                if p.string is not None:
                    miaoShu = p.string
        elif li_son.h1.string == "症状起因":
            for p in li_son.find_all('p'):
                if p.string is not None:
                    qiYin = p.string
        elif li_son.h1.string == "症状诊断":
            for p in li_son.find_all('p'):
                if p.string is not None:
                    zhenDuan = p.string
        elif li_son.h1.string == "相关疾病":
            for a in li_son.find_all('a'):
                jibing_names.append(a.string)
                jibing_links.append(url_main + a['href'])

    # Crawl every related disease page, then store the symptom itself.
    for name, link in zip(jibing_names, jibing_links):
        puerTest3.sonpageJibing(name, str(link))
    puerMysql1.addToSql(zhengZhuang, miaoShu, qiYin, zhenDuan, url,
                        jibing_names)
# -*- coding:utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerMysql2
import MySQLdb
def sonpageJibing(jibing, url):
    """Crawl one disease page and store its fields in MySQL.

    Skips the page when the disease already exists in the `disease` table,
    which lets an interrupted crawl be resumed without duplicate work.

    :param jibing: disease name
    :param url: absolute URL of the disease page
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        try:
            # Parameterized query — the original interpolated the disease
            # name directly into the SQL string.
            exist = cur.execute(
                "SELECT disease from disease WHERE disease=%s", (jibing,))
        finally:
            cur.close()
    finally:
        conn.close()
    if exist == 1:
        return  # already crawled

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        time.sleep(3)  # dynamic page; wait for JS rendering
        source = browser.page_source
    finally:
        browser.quit()  # release the browser even on failure
    soup = BeautifulSoup(source, "html5lib")

    # Initialize every field up front: any section may be missing from the
    # page, and the DB insert expects all of them.
    introduction = ''
    yibao = ''          # medical-insurance coverage
    chuanranxing = ''   # contagiousness
    duofarenqun = ''    # susceptible population
    feiyong = ''        # treatment cost
    zhiyulu = ''        # cure rate
    fangfa = ''         # treatment methods
    relajiancha = ''    # related examinations
    yaopin = ''         # common medications
    zhouqi = ''         # treatment period
    keshi = []          # recommended departments (may be several)
    bingyin = ''        # cause
    yufang = ''         # prevention
    bingfazheng = ''    # complications
    zhengzhuang = ''    # symptoms
    jianbie = ''        # differential diagnosis
    zhiliao = ''        # treatment
    huli = ''           # nursing
    yinshi = ''         # diet

    # BUG FIX: dict attribute filter instead of the original set literal.
    div = soup.find('div', attrs={"class": "right_content"})
    for li in div.ul.find_all('li'):
        if li.h1 is None:
            introduction = li.p.string
        elif '诊疗知识' in li.h1.string:
            # Key/value rows such as "医保:<span>是</span>"; the label is the
            # text between the <div> tag and its first child. (Renamed from
            # the original's `div`, which shadowed the outer variable.)
            for row in li.find_all('div'):
                label = row.next_element
                if '医保' in label:
                    yibao = row.span.string
                elif '传染' in label:
                    chuanranxing = row.span.string
                elif '多发人群' in label:
                    duofarenqun = row.span.string
                elif '治疗费用' in label:
                    feiyong = row.span.string
                elif '治愈率' in label:
                    zhiyulu = row.span.string
                elif '治疗方法' in label:
                    fangfa = row.span.string
                elif '相关检查' in label:
                    relajiancha = row.span.string
                elif '常用药品' in label:
                    yaopin = row.span.string
                elif '治疗周期' in label:
                    zhouqi = row.span.string
        elif '就诊科室' in li.h1.string:
            for a in li.find_all('a'):
                keshi.append(a.string)
        elif '病因' in li.h1.string:
            bingyin = li.p.string
        elif '预防' in li.h1.string:
            yufang = li.p.string
        elif '并发症' in li.h1.string:
            bingfazheng = li.p.string
        elif '症状' in li.h1.string:
            zhengzhuang = li.p.string
        elif '诊断鉴别' in li.h1.string:
            jianbie = li.p.string
        elif '治疗' in li.h1.string:
            zhiliao = li.p.string
        elif '护理' in li.h1.string:
            huli = li.p.string
        elif '饮食' in li.h1.string:
            yinshi = li.p.string

    puerMysql2.addToSql(jibing, url, introduction, yibao, chuanranxing,
                        duofarenqun, feiyong, zhiyulu, fangfa, relajiancha,
                        yaopin, zhouqi, keshi, bingyin, yufang, bingfazheng,
                        zhengzhuang, jianbie, zhiliao, huli, yinshi)
2.2 数据库相关
# -*- coding:utf-8 -*-
import MySQLdb
def addToSql(generalArea, detailArea, zhengZhuang):
    """Insert one (general area, detail area, symptom) row into `area`.

    Uses a parameterized INSERT: the original built the SQL by string
    interpolation, which breaks on names containing quotes and is
    injection-prone.

    :param generalArea: general body-area name
    :param detailArea: detail body-area name
    :param zhengZhuang: symptom name
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "INSERT INTO area VALUES (%s, %s, %s)",
                (generalArea, detailArea, zhengZhuang))
        finally:
            cur.close()
        conn.commit()  # commit the transaction
    finally:
        conn.close()  # close the connection even when execute raises
# -*- coding:utf-8 -*-
import MySQLdb
def addToSql(zhengZhuang, miaoShu, qiYin, zhenDuan, url, xiangGuanJiBing):
    """Store one symptom row plus its related-disease links.

    Inserts (symptom, description, cause, diagnosis, link) into `symptom`
    and one (symptom, disease) row into `reladisease` per related disease.

    Parameterized queries replace the original string interpolation; that
    also makes the old single-quote-to-double-quote substitution hack
    unnecessary, so the text is stored unmangled.

    :param zhengZhuang: symptom name
    :param miaoShu: symptom description
    :param qiYin: symptom cause
    :param zhenDuan: symptom diagnosis
    :param url: symptom page URL
    :param xiangGuanJiBing: iterable of related disease names
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "INSERT INTO symptom (symptom, des, reason, diagnosis, link)"
                " VALUES (%s, %s, %s, %s, %s)",
                (zhengZhuang, miaoShu, qiYin, zhenDuan, url))
            for jibing in xiangGuanJiBing:
                cur.execute(
                    "INSERT INTO reladisease (symptom, relaDisease)"
                    " VALUES (%s, %s)",
                    (zhengZhuang, jibing))
        finally:
            cur.close()
        conn.commit()  # commit both inserts as one transaction
    finally:
        conn.close()  # close the connection even when execute raises
# -*- coding:utf-8 -*-
import MySQLdb
def addToSql(jibing, url, intruduction, yibao, chuanranxing, duofarenqun,
             feiyong, zhiyulu, fangfa, relajiancha, yaopin, zhouqi, keshi,
             bingyin, yufang, bingfazheng, zhengzhuang, jianbie, zhiliao,
             huli, yinshi):
    """Store one disease row in `disease` plus its department links in `keshi`.

    Parameterized queries replace the original string interpolation, which
    also removes the need for the per-field single-quote substitution hack
    (and its None checks — the driver stores None as SQL NULL).

    :param jibing: disease name
    :param url: disease page URL
    :param intruduction: introduction text (name kept for signature
        compatibility with callers)
    :param keshi: iterable of recommended department names
    Remaining parameters are the disease page's section texts (insurance,
    contagiousness, susceptible population, cost, cure rate, methods,
    examinations, medications, period, cause, prevention, complications,
    symptoms, differential diagnosis, treatment, nursing, diet).
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "INSERT INTO disease VALUES"
                " (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
                " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (jibing, url, intruduction, yibao, chuanranxing, duofarenqun,
                 feiyong, zhiyulu, fangfa, relajiancha, yaopin, zhouqi,
                 bingyin, yufang, bingfazheng, zhengzhuang, jianbie, zhiliao,
                 huli, yinshi))
            # One row per recommended department.
            for dept in keshi:
                cur.execute(
                    "INSERT INTO keshi (disease, keshi) VALUES (%s, %s)",
                    (jibing, dept))
        finally:
            cur.close()
        conn.commit()  # commit all inserts as one transaction
    finally:
        conn.close()  # close the connection even when execute raises