python爬取大学的一个小栗子

不二嘉ya

已于 2023-07-25 10:49:16 修改

阅读量62

点赞数

文章标签： python 开发语言 数据库

于 2023-07-25 10:47:52 首次发布

本文链接：https://blog.csdn.net/weixin_65687276/article/details/131912506

版权

'''
Created on 2018年8月3日

@author: jied
'''
import pymysql
import traceback
import time
from . import DataConfig as dc




# Database access helper class
class DataBaseOp(object):
    """Small CRUD helper built on one PyMySQL connection shared by all instances.

    SQL fragments such as ``tablename``, ``column`` and ``where`` are pasted
    into the statement verbatim, so they must come from trusted code, and a
    ``where`` fragment must carry its own leading space (e.g. ``" where id=1"``).
    Row *values* given to insert/update/myinsert/findOneById are sent as query
    parameters, so quotes and backslashes in the data cannot break the SQL.
    """

    # Connection shared by every instance; opened lazily by getconn().
    db = None

    def __init__(self):
        # Nothing to set up: the connection settings are read from the
        # DataConfig module (imported as ``dc``) the first time getconn() runs.
        pass

    def getconn(self):
        """Return the shared connection, opening it on first use."""
        if DataBaseOp.db is None:
            DataBaseOp.db = pymysql.connect(host=dc.dbhost, user=dc.dbuser,
                                            password=dc.dbpassword,
                                            port=dc.dbport, db=dc.dbname)
        return DataBaseOp.db

    def _close(self):
        """Close the shared connection and forget it so getconn() reconnects.

        Without resetting the class attribute, a closed connection would be
        handed out again and every later statement would fail.
        """
        conn = DataBaseOp.db
        DataBaseOp.db = None
        if conn is not None:
            conn.close()

    def _execute_write(self, conn, sql, args=None, require_rows=False):
        """Run one modifying statement on *conn* and commit it.

        Returns 1 after a commit, or 0 when *require_rows* is set and
        ``cursor.execute`` returned a falsy value (nothing is committed then).
        On any error the transaction is rolled back and the error re-raised.
        """
        cursor = conn.cursor()
        try:
            if args is None:
                affected = cursor.execute(sql)
            else:
                affected = cursor.execute(sql, args)
            if require_rows and not affected:
                return 0
            conn.commit()
            return 1
        except Exception:
            conn.rollback()
            raise
        finally:
            cursor.close()

    def _values(self, props):
        """Return the property values as strings, with None/'None' mapped to ''."""
        return tuple(str(self.strFilter(value)) for value in props.values())

    def _insert_row(self, tablename, props, echo_sql_on_error):
        """Shared implementation of insert() and myinsert()."""
        if not props:
            # No columns means there is no meaningful statement to run.
            print('insertData Failed')
            return -1
        columns = ",".join(props)
        placeholders = ",".join(["%s"] * len(props))
        sql = ('insert into ' + tablename + '(' + columns + ')'
               + ' values (' + placeholders + ')')
        conn = self.getconn()
        print(sql)
        try:
            return self._execute_write(conn, sql, self._values(props),
                                       require_rows=True)
        except Exception as e:
            if echo_sql_on_error:
                print(sql)
            print('insertData Failed', e)
            return -1

    def delete(self, tablename, where):
        """Delete rows from *tablename* and close the shared connection.

        *where* is appended verbatim to ``delete from <tablename>``.

        Returns 1 on success.
        Raises SyntaxError('delete error') if the statement fails; the
        original error is attached as the cause.
        """
        sql = "delete from " + tablename + where
        print(sql)
        conn = self.getconn()
        try:
            self._execute_write(conn, sql)
        except Exception as e:
            print('delete Error')
            self._close()
            raise SyntaxError('delete error') from e
        self._close()
        print('delete success')
        return 1

    def update(self, tablename, where, **props):
        """Set the given columns on the rows selected by *where*.

        Each keyword argument is a column name and its new value. *where* is
        appended verbatim after the assignments.

        Returns 1 on success and -1 on failure (including when no column is
        given).
        """
        if not props:
            print('update Error')
            return -1
        assignments = ",".join(column + "=%s" for column in props)
        # The statement goes through %-style parameter substitution, so a
        # literal '%' in the caller's where clause (e.g. LIKE '%x%') must be
        # doubled to survive it.
        sql = ("update " + tablename + " set " + assignments
               + where.replace("%", "%%"))
        conn = self.getconn()
        try:
            return self._execute_write(conn, sql, self._values(props))
        except Exception as e:
            print('update Error', e)
            return -1

    def insertSql(self, sql):
        """Execute a ready-made insert statement, then close the connection.

        Returns 1 if the statement reported affected rows and was committed,
        0 if it reported none, and -1 on failure.
        """
        conn = self.getconn()
        try:
            check = self._execute_write(conn, sql, require_rows=True)
            if check == 1:
                print('Insert Successful')
        except Exception as e:
            check = -1
            print('insertData Failed', e)
        self._close()
        return check

    def insert(self, tablename, **props):
        """Insert one row into *tablename*.

        Each keyword argument is a column name and its value; values are
        stringified, with None and the string 'None' stored as ''.

        Returns 1 if a row was inserted and committed, 0 if the statement
        reported no affected rows, and -1 on failure.
        """
        return self._insert_row(tablename, props, echo_sql_on_error=False)

    def findAll(self, tablename, startindex, endindex, column, where):
        """Select *column* (or every column when it is empty) with a LIMIT.

        *startindex* and *endindex* are passed as the two LIMIT operands in
        that order; MySQL reads the second operand as a row count.
        *where* is appended verbatim after the table name.
        """
        sql = ("select " + (column if column else "*") + " from " + tablename
               + where + " limit " + str(startindex) + "," + str(endindex))
        print(sql)
        return self.find(sql)

    def findBetweenAndId(self, tablename, columns, startid, endid):
        """Select rows whose ``id`` lies between *startid* and *endid*."""
        sql = ("select " + (columns if columns else "*") + " from " + tablename
               + " t WHERE  t.id  BETWEEN " + str(startid)
               + " AND " + str(endid))
        return self.find(sql)

    def findCount(self, tablename):
        """Return the row count of *tablename*.

        Raises IndexError if the query failed and produced no row.
        """
        sql = "select count(0) from " + tablename + " t "
        return self.find(sql)[0][0]

    def findCountBySql(self, sql):
        """Return the first column of the first row produced by *sql*.

        Raises IndexError if the query failed or produced no row.
        """
        return self.find(sql)[0][0]

    def findMaxId(self, tablename):
        """Return the first result row of ``max(id)``, i.e. a one-element row.

        Note that this is the row itself, not the scalar inside it.
        Raises IndexError if the query failed and produced no row.
        """
        sql = "select max(t.id) from " + tablename + " t "
        return self.find(sql)[0]

    def findOneById(self, tablename, id):
        """Return the rows of *tablename* whose ``id`` equals *id*.

        *id* may be a string or a number; it is sent as a string parameter.
        """
        sql = "select * from " + tablename + " where id=%s"
        return self.find(sql, (str(id),))

    def find(self, sql, args=None):
        """Run a query and return its rows as a list.

        *args*, when given, are passed to ``cursor.execute`` as query
        parameters. On failure the error is printed and an empty list is
        returned.
        """
        rows = []
        cursor = self.getconn().cursor()
        try:
            if args is None:
                cursor.execute(sql)
            else:
                cursor.execute(sql, args)
            rows.extend(cursor.fetchall())
        except Exception as e:
            print('find Error', e)
        finally:
            cursor.close()
        return rows

    def findResults(self, sql):
        """Run a query and return ``cursor.fetchall()`` unchanged.

        On failure the error is printed and an empty list is returned.
        """
        cursor = self.getconn().cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        except Exception as e:
            print('find Error', e)
            return []
        finally:
            cursor.close()

    def strFilter(self, value):
        """Map None and the string 'None' to ''; return anything else as is."""
        if value is None or value == 'None':
            return ''
        return value

    def myinsert(self, tablename, **props):
        """Same as insert(), but also prints the statement again on failure."""
        return self._insert_row(tablename, props, echo_sql_on_error=True)
    
  
        
        
    
#bataBaseOp = DataBaseOp()
        
''' 
bataBaseOp.insert('t_comp_patent', patentName='aa',
  applicationNum='ab',
  apilData='ac',
  publicNum='ad',
  publicDate='ae',
  applicant='af',
  Inventor='ag',
  patentType='ah',
  classNum='ai',
  patentAgency='aj',
  agent='ak',
  detailedInfo='al',
  imgurl='am',
  compId='an')
'''
  

#list=bataBaseOp.findOneById('t_bk_comp', '10')
#print(list)

将 DataBaseOp.py 放在任意一个包下（下面的爬虫代码按包名 database 导入它），然后把 getconn() 里用到的数据库连接配置（主机、端口、用户名、密码、库名）改成你自己的。

'''
<大吉大利,没有bug>
'''
# encoding=utf-8

from selenium import webdriver
import pandas as pd
import json
import time
import datetime
import re

from database import DataBaseOp as dbop

from pyquery import PyQuery as pq
from selenium.webdriver.common.by import By


# Faculty-profile crawler (currently configured for the School of Philosophy, Renmin University of China)
class Crawling(object):
    """Scrape faculty profiles from listing pages and store them in the database.

    readPage() is the entry point: it collects (link, name, title) entries
    from the listing pages, skips links already stored in ``yxj_t_expert``,
    scrapes each remaining profile page and inserts one row per profile.
    """

    # Absolute XPaths into the profile page layout; they depend on the exact
    # page structure and need updating if the site template changes.
    _PHOTO_XPATH = '/html/body/div[6]/div/div[2]/div[2]/div[1]/div[1]'
    _INTRO_XPATH = '/html/body/div[6]/div/div[2]/div[2]/div[3]/div[1]/p[1]'

    def __init__(self):
        # NOTE(review): readPage() uses its own hard-coded listing URLs and
        # never reads self.url; kept because callers may still access it.
        self.url = 'http://www.sard.ruc.edu.cn/szll/zzjs/qzjs/index.htm'

    def getDriver(self, url):
        """Start a Chrome session configured for scraping and load *url*.

        The driver is returned even when loading the page fails (the failure
        is only printed). The caller owns the driver and must quit it.
        """
        options = webdriver.ChromeOptions()
        # Disable image loading to speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        # options.add_argument("--no-sandbox") # linux only
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browserClientA"}})
        # Hide navigator.webdriver from page scripts on every new document.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                   Object.defineProperty(navigator, 'webdriver', {
                       get: () => undefined
                   })
               """
        })
        try:
            driver.get(url)
            driver.implicitly_wait(25)
            driver.maximize_window()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C still stops the crawl.
            print('url error', e)

        return driver

    def getImg(self):
        """Placeholder for image handling; not implemented."""
        pass

    @staticmethod
    def _sql_literal(value):
        """Return *value* as a quoted SQL string literal.

        Backslashes and single quotes are doubled so a scraped link cannot
        terminate the literal early.
        NOTE(review): assumes the MySQL server uses its default backslash
        escaping (NO_BACKSLASH_ESCAPES off) - TODO confirm.
        """
        return "'" + value.replace("\\", "\\\\").replace("'", "''") + "'"

    @staticmethod
    def _last_with_prefix(texts, prefix, default):
        """Return the last text starting with *prefix*, or *default* if none does."""
        result = default
        for text in texts:
            if text.startswith(prefix):
                result = text
        return result

    def _collect_listing(self, listurl):
        """Return a list of [href, name, job_title] entries from the listing pages."""
        entries = []
        for index, url in enumerate(listurl):
            print('第' + str(index) + '页')
            driver = self.getDriver(url)
            try:
                time.sleep(2)
                for row in driver.find_elements(By.CLASS_NAME, 'row'):
                    for anchor in row.find_elements(By.TAG_NAME, "a"):
                        href = anchor.get_attribute("href")
                        if not href:
                            # Without a link there is no profile page to visit.
                            continue
                        name = anchor.find_element(By.TAG_NAME, "h5").text
                        job_title = anchor.find_element(By.TAG_NAME, "p").text
                        entry = [href, name, job_title]
                        print(entry)
                        entries.append(entry)
            finally:
                # One browser is started per listing page; always shut it down.
                driver.quit()
        return entries

    def _scrape_profile(self, driver, job_title):
        """Read the profile fields from the page currently loaded in *driver*.

        *job_title* is the title from the listing page; it is kept unless the
        profile page provides its own. Every field is best-effort: a missing
        element prints a message and leaves the field at its default.
        """
        profile = {'img': '', 'job_title': job_title, 'email': '', 'tel': '',
                   'introduction': '', 'html': ''}

        # Give the page a moment to finish rendering before reading it.
        time.sleep(2)

        try:
            profile['img'] = driver.find_element(
                By.XPATH, self._PHOTO_XPATH).get_attribute('style')
        except Exception:
            print('error img')

        # Title, email and phone all live in <span>s of the same container,
        # so look it up once instead of once per field.
        try:
            box = driver.find_element(By.CLASS_NAME, "teacherDisplayText")
            span_texts = [span.text
                          for span in box.find_elements(By.TAG_NAME, "span")]
        except Exception:
            span_texts = []
            for field in ('job_title', 'email', 'tel'):
                print('error ' + field)
        profile['job_title'] = self._last_with_prefix(span_texts, '职称', job_title)
        profile['email'] = self._last_with_prefix(span_texts, 'email', '')
        profile['tel'] = self._last_with_prefix(span_texts, '联系电话', '')

        try:
            profile['introduction'] = driver.find_element(
                By.XPATH, self._INTRO_XPATH).text
        except Exception:
            print('error introduction')

        try:
            # Single quotes are replaced so the stored page source cannot
            # break SQL that is assembled with quoted string values.
            profile['html'] = driver.page_source.replace("'", '"')
        except Exception:
            print('error html')

        return profile

    def readPage(self):
        """Crawl the listing pages and insert every profile not yet stored."""
        listurl = [
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index1.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index2.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index3.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index4.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index.htm',
        ]

        entries = self._collect_listing(listurl)
        print(entries)

        university = '中国人民大学'
        dept = '哲学院'

        for href, name, listed_title in entries:
            # Skip profiles whose page URL is already stored.
            sql = ("select count(t.id) from yxj_t_expert t where t.pageurl="
                   + self._sql_literal(href) + " ")
            if dbop.DataBaseOp().findCountBySql(sql) > 0:
                continue

            driver = self.getDriver(href)
            try:
                profile = self._scrape_profile(driver, listed_title)
            finally:
                # quit() ends the whole browser session even if scraping
                # raised, so no Chrome process is left behind.
                driver.quit()

            dbop.DataBaseOp().insert('yxj_t_expert', university=university,
                                     headtag='',
                                     dept=dept,
                                     name=name,
                                     tel=profile['tel'],
                                     level='',
                                     sex='',
                                     introduction=profile['introduction'],
                                     job_title=profile['job_title'],
                                     job='',
                                     birthday='',
                                     img=profile['img'],
                                     degree='',
                                     email=profile['email'],
                                     pageurl=href,
                                     research_direction='',
                                     html=profile['html'],
                                     education='',
                                     research_project='',
                                     paper='',
                                     honors_awards='',
                                     awards='',
                                     research_findings='',
                                     work_experience='',
                                     subject='',
                                     main_part_job='',
                                     main_works='',
                                     createtime=datetime.datetime.now())


# Script entry point: executing this file builds the crawler and starts the
# full crawl right away (the same happens if the module is imported).
c = Crawling()
c.readPage()

insert() 里的字段请对照你自己的表结构按需增减：表里有哪些字段，就传哪些字段。