python爬虫记录

1. 简述

1.1 整体目标

  1. 目标网站:http://www.pesrmyy.com/huamdisease/List#0|01
  2. 爬取主要部位-细节部位-症状(信息)-相关疾病(信息)
  3. 相关数据保存在MySQL中。

1.2 主要内容

  1. beautifulsoup
  2. 无界面浏览器,爬取动态内容
  3. 连接数据库

2. 代码

2.1 网页相关

2.1.1 puerTest0.py

# -*- coding:utf-8 -*-
"""Entry script: walk the body-part navigation menu on the hospital site.

For every (general area, detail area) pair found in the left-hand menu,
build the listing URL and hand it to puerTest1.page() for crawling.
"""

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerTest1

url_ori = "http://www.pesrmyy.com/HuamDisease/List#"
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
try:
    browser.get(url_ori)
    time.sleep(2)  # menu is rendered dynamically; wait so the source is complete
    soup = BeautifulSoup(browser.page_source, "html5lib")

    # FIX: `attrs` must be a dict ({attribute: value}); the original passed
    # a set, which BeautifulSoup cannot match against.
    div = soup.find('div', attrs={"class": "block_left"})
    ul = div.find('ul')
    for li in ul.find_all('li'):
        if li.ul is not None:  # only top-level entries that own a sub-menu
            generalArea = li.a.next_element  # text of the general body part
            data_id = li['data-id']
            for li_son in li.ul.find_all('li'):
                detailArea = li_son.string
                data_id2 = li_son['data-id']
                # Listing URL is keyed by "<general id>|<detail id>".
                url = url_ori + data_id + "|" + data_id2
                print(generalArea, detailArea, url)
                puerTest1.page(generalArea, detailArea, url)
finally:
    browser.quit()  # FIX: the original script never closed the browser process

2.1.2 puerTest1.py

# -*- coding:utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerMysql0
import puerTest2
import MySQLdb


def page(generalArea, detailArea, url):    # process one detail-area listing page
    """Crawl the symptom listing page for one detailed body part.

    For every symptom on the page: record the (area, sub-area, symptom)
    triple, then crawl the symptom's own detail page unless it was already
    stored in a previous run.

    Args:
        generalArea: name of the general body part.
        detailArea: name of the detailed body part.
        url: absolute URL of the listing page for this detail area.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        time.sleep(5)  # page is loaded dynamically; wait so the source is complete
        soup = BeautifulSoup(browser.page_source, "html5lib")

        url_main = 'http://www.pesrmyy.com'
        # FIX: `attrs` must be a dict ({attribute: value}); the original
        # passed a set, which BeautifulSoup cannot match against.
        div = soup.find('div', attrs={"class": "right_content"})
        for li in div.ul.find_all('li'):
            zhengzhuang = li.a.h1.string
            href = li.a['href']
            # NOTE: to resume after an interrupted run, one can parse the
            # numeric id out of `href` here and `continue` past ids that were
            # already crawled.

            # Must always run, even for already-crawled symptoms: the area
            # mapping row is independent of the symptom detail page.
            puerMysql0.addToSql(generalArea, detailArea, zhengzhuang)

            conn = MySQLdb.connect(
                host='127.0.0.1',
                port=3306,
                user='root',
                passwd='123456',
                db='puerhospital',
                charset='utf8',
            )
            try:
                cur = conn.cursor()
                # FIX: parameterized query -- symptom names containing quotes
                # previously broke the interpolated SQL (injection-prone).
                exist = cur.execute(
                    "SELECT symptom FROM symptom WHERE symptom=%s",
                    (zhengzhuang,))
                cur.close()
            finally:
                conn.close()  # FIX: connection is released even on error
            if exist:  # symptom detail page already crawled -> next symptom
                continue

            url_son = url_main + href
            print("url_son:" + url_son)

            puerTest2.sonpage(url_son)
    finally:
        browser.close()  # FIX: close the browser even if the crawl raises

2.1.3 puerTest2.py

# -*- coding:utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerTest3
import puerMysql0
import puerMysql1


def sonpage(url):          # process one symptom detail page
    """Crawl one symptom detail page and persist it.

    Extracts the symptom name, description, cause, diagnosis and related
    diseases; crawls each related disease page first, then writes the
    symptom row via puerMysql1.

    Args:
        url: absolute URL of the symptom detail page.
    """
    url_main = "http://www.pesrmyy.com"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        time.sleep(2)  # dynamic page; wait for the content to render
        soup = BeautifulSoup(browser.page_source, "html5lib")

        # Symptom name lives in block_right_head > title > div.
        div_right_head = soup.find('div', class_="block_right_head")
        div_title = div_right_head.find('div', class_="title")
        zhengZhuang = div_title.div.string

        xiangGuanJiBing = [[], []]     # [0] disease names, [1] their page links

        # Initialize up-front: any of these sections may be absent from the
        # page, and puerMysql1.addToSql needs all of them.
        miaoShu = ""
        qiYin = ""
        zhenDuan = ""

        # FIX: `attrs` must be a dict ({attribute: value}); the original
        # passed a set, which BeautifulSoup cannot match against.
        div_son = soup.find('div', attrs={"class": "right_content"})
        for li_son in div_son.ul.find_all('li'):
            if li_son.h1 is None:      # unlabeled section = description
                for p in li_son.find_all('p'):
                    if p.string is None:
                        continue
                    miaoShu = p.string   # site markup is messy; keep last non-empty <p>
            elif li_son.h1.string == "症状起因":
                for p in li_son.find_all('p'):
                    if p.string is None:
                        continue
                    qiYin = p.string
            elif li_son.h1.string == "症状诊断":
                for p in li_son.find_all('p'):
                    if p.string is None:
                        continue
                    zhenDuan = p.string
            elif li_son.h1.string == "相关疾病":
                for a in li_son.find_all('a'):
                    xiangGuanJiBing[0].append(a.string)
                    xiangGuanJiBing[1].append(url_main + a['href'])

        # Crawl each related disease page before writing the symptom row.
        for name, link in zip(xiangGuanJiBing[0], xiangGuanJiBing[1]):
            puerTest3.sonpageJibing(name, str(link))

        puerMysql1.addToSql(zhengZhuang, miaoShu, qiYin, zhenDuan, url,
                            xiangGuanJiBing[0])
    finally:
        browser.close()  # FIX: close the browser even if the crawl raises

2.1.4 puerTest3.py

# -*- coding:utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import puerMysql2
import MySQLdb


def sonpageJibing(jibing, url):       # crawl one disease detail page
    """Crawl one disease detail page and persist it via puerMysql2.

    Skips the page entirely if the disease is already in the `disease`
    table, so an interrupted run can be resumed without re-crawling.

    Args:
        jibing: disease name (used as the lookup key in the `disease` table).
        url: absolute URL of the disease detail page.
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        # FIX: parameterized query -- a disease name containing a quote
        # previously broke the interpolated SQL (injection-prone).
        exist = cur.execute("SELECT disease FROM disease WHERE disease=%s",
                            (jibing,))
        cur.close()
    finally:
        conn.close()  # FIX: connection is released even on error
    if exist:  # disease page already crawled
        return

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)  # headless Chrome
    try:
        browser.get(url)
        time.sleep(3)  # dynamic page; wait for the content to render
        soup = BeautifulSoup(browser.page_source, "html5lib")

        # Initialize every field up-front: any section may be missing from
        # the page, and the final addToSql call needs all of them.
        intruduction = ''
        yibao = ''
        chuanranxing = ''
        duofarenqun = ''
        feiyong = ''
        zhiyulu = ''
        fangfa = ''
        relajiancha = ''
        yaopin = ''
        zhouqi = ''

        keshi = []      # a disease may recommend several departments
        bingyin = ''
        yufang = ''
        bingfazheng = ''
        zhengzhuang = ''
        jianbie = ''
        zhiliao = ''
        huli = ''
        yinshi = ''

        # FIX: `attrs` must be a dict ({attribute: value}); the original
        # passed a set, which BeautifulSoup cannot match against.
        content = soup.find('div', attrs={"class": "right_content"})
        for li in content.ul.find_all('li'):
            if li.h1 is None:          # unlabeled section = introduction
                intruduction = li.p.string
            elif '诊疗知识' in li.h1.string:
                # FIX: loop variable renamed -- the original reused `div`,
                # shadowing the outer container variable.
                for info in li.find_all('div'):
                    label = info.next_element  # text between the tag and its first child
                    if '医保' in label:
                        yibao = info.span.string
                    elif '传染' in label:
                        chuanranxing = info.span.string
                    elif '多发人群' in label:
                        duofarenqun = info.span.string
                    elif '治疗费用' in label:
                        feiyong = info.span.string
                    elif '治愈率' in label:
                        zhiyulu = info.span.string
                    elif '治疗方法' in label:
                        fangfa = info.span.string
                    elif '相关检查' in label:
                        relajiancha = info.span.string
                    elif '常用药品' in label:
                        yaopin = info.span.string
                    elif '治疗周期' in label:
                        zhouqi = info.span.string
            elif '就诊科室' in li.h1.string:
                for a in li.find_all('a'):
                    keshi.append(a.string)
            elif '病因' in li.h1.string:
                bingyin = li.p.string
            elif '预防' in li.h1.string:
                yufang = li.p.string
            elif '并发症' in li.h1.string:
                bingfazheng = li.p.string
            elif '症状' in li.h1.string:
                zhengzhuang = li.p.string
            elif '诊断鉴别' in li.h1.string:
                jianbie = li.p.string
            elif '治疗' in li.h1.string:
                zhiliao = li.p.string
            elif '护理' in li.h1.string:
                huli = li.p.string
            elif '饮食' in li.h1.string:
                yinshi = li.p.string

        puerMysql2.addToSql(jibing, url, intruduction, yibao, chuanranxing,
                            duofarenqun, feiyong, zhiyulu, fangfa, relajiancha,
                            yaopin, zhouqi, keshi, bingyin, yufang,
                            bingfazheng, zhengzhuang, jianbie, zhiliao, huli,
                            yinshi)
    finally:
        browser.close()  # FIX: close the browser even if the crawl raises

2.2 数据库相关

2.2.1 puerMysql0.py

# -*- coding:utf-8 -*-

import MySQLdb


def addToSql(generalArea, detailArea, zhengZhuang):
    """Insert one (general area, detail area, symptom) row into `area`.

    FIX: uses a parameterized query so values containing quotes cannot
    break the statement -- the original interpolated the values directly
    into the SQL string with no escaping at all.

    Args:
        generalArea: name of the general body part.
        detailArea: name of the detailed body part.
        zhengZhuang: symptom name.
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        cur.execute("INSERT INTO area VALUES (%s, %s, %s)",
                    (generalArea, detailArea, zhengZhuang))
        cur.close()
        conn.commit()  # commit the insert
    finally:
        conn.close()  # FIX: connection is released even if execute raises

2.2.2 puerMysql1.py

# -*- coding:utf-8 -*-

import MySQLdb


def addToSql(zhengZhuang, miaoShu, qiYin, zhenDuan, url, xiangGuanJiBing):
    """Insert one symptom row plus its related-disease link rows.

    FIX: parameterized queries replace the old replace("'", '"') hack, so
    quotes in the text are now stored verbatim instead of being mangled,
    and the statements can no longer be broken by special characters.

    Args:
        zhengZhuang: symptom name.
        miaoShu: symptom description.
        qiYin: symptom cause.
        zhenDuan: symptom diagnosis text.
        url: source page URL.
        xiangGuanJiBing: list of related disease names.
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        cur.execute(
            "INSERT INTO symptom (symptom, des, reason, diagnosis, link)"
            " VALUES (%s, %s, %s, %s, %s)",
            (zhengZhuang, miaoShu, qiYin, zhenDuan, url))

        # One link row per related disease.
        for jibing in xiangGuanJiBing:
            cur.execute(
                "INSERT INTO reladisease (symptom, relaDisease)"
                " VALUES (%s, %s)",
                (zhengZhuang, jibing))

        cur.close()
        conn.commit()  # commit all inserts together
    finally:
        conn.close()  # FIX: connection is released even if execute raises

2.2.3 puerMysql2.py

# -*- coding:utf-8 -*-

import MySQLdb


def addToSql(jibing, url, intruduction, yibao, chuanranxing, duofarenqun, feiyong, zhiyulu, fangfa, relajiancha, yaopin, zhouqi, keshi, bingyin, yufang, bingfazheng, zhengzhuang, jianbie, zhiliao, huli, yinshi):
    """Insert one disease row into `disease` plus its department rows into `keshi`.

    FIX: parameterized queries replace both the manual quote-replacement
    chain and the raw interpolation. Quotes are stored verbatim, injection
    is impossible, and fields that are None (section missing from the page)
    are stored as SQL NULL instead of the literal string 'None'.

    Args:
        jibing: disease name.
        url: source page URL.
        keshi: list of recommended departments (may be empty).
        All remaining arguments are the text of the corresponding page
        sections, or None/'' when the section was absent.
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='puerhospital',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        # 20 columns -> 20 placeholders, in table-definition order.
        cur.execute(
            "INSERT INTO disease VALUES (" + ", ".join(["%s"] * 20) + ")",
            (jibing, url, intruduction, yibao, chuanranxing, duofarenqun,
             feiyong, zhiyulu, fangfa, relajiancha, yaopin, zhouqi, bingyin,
             yufang, bingfazheng, zhengzhuang, jianbie, zhiliao, huli,
             yinshi))

        # One row per recommended department.
        for department in keshi:
            cur.execute(
                "INSERT INTO keshi (disease, keshi) VALUES (%s, %s)",
                (jibing, department))

        cur.close()
        conn.commit()  # commit all inserts together
    finally:
        conn.close()  # FIX: connection is released even if execute raises
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值