用 Python 爬取大学教师信息的一个小例子

'''
Created on 2018年8月3日

@author: jied
'''
import pymysql
import traceback
import time
from . import DataConfig as dc




#数据库操作类
class DataBaseOp(object):
    """Small helper around one pymysql connection shared by all instances.

    The connection is opened lazily by getconn() from the settings in the
    DataConfig module and stored on the class.  Write helpers return 1 on
    success, 0 when the statement ran but reported no affected rows, and -1
    when the statement failed (delete raises instead).
    """

    # Connection shared by every instance; None until getconn() opens it.
    db = None

    def __init__(self):
        pass

    def getconn(self):
        """Return the shared connection, opening it if there is none."""
        if DataBaseOp.db is None:
            DataBaseOp.db = pymysql.connect(host=dc.dbhost, user=dc.dbuser,
                                            password=dc.dbpassword,
                                            port=dc.dbport, db=dc.dbname)
        return DataBaseOp.db

    def _closeconn(self):
        """Close the shared connection and forget it.

        Resetting the class attribute lets the next getconn() reconnect
        instead of handing out a connection that is already closed.
        """
        conn = DataBaseOp.db
        DataBaseOp.db = None
        if conn is not None:
            conn.close()

    def _execute(self, cursor, sql, args=None):
        """Run sql on cursor, passing args only when the caller gave some."""
        if args is None:
            return cursor.execute(sql)
        return cursor.execute(sql, args)

    def _paramvalues(self, props):
        """Return the values of props as strings, with None mapped to ''."""
        return [str(self.strFilter(value)) for value in props.values()]

    def delete(self, tablename, where):
        """Delete rows from tablename and close the shared connection.

        where is appended verbatim, so it must carry its own leading
        " where ..." text and must come from trusted code.

        Returns 1 on success.
        Raises SyntaxError('delete error') when the statement fails; the
        original exception is attached as the cause.
        """
        sql = "delete from " + tablename + where
        print(sql)
        db = self.getconn()
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception as e:
            db.rollback()
            print('delete Error')
            raise SyntaxError('delete error') from e
        finally:
            self._closeconn()
        print('delete success')
        return 1

    def update(self, tablename, where, **props):
        """Set the columns named in props on the rows selected by where.

        Values are sent as query parameters (None and 'None' become '').
        where is appended verbatim and must come from trusted code.

        Returns 1 on success and -1 on failure or when props is empty.
        """
        if not props:
            print('update Error')
            return -1
        assignments = ",".join(column + "=%s" for column in props)
        # The driver %-formats the statement when parameters are supplied,
        # so a literal '%' in the caller's where clause has to be doubled.
        sql = ("update " + tablename + " set " + assignments
               + where.replace('%', '%%'))
        db = self.getconn()
        cursor = db.cursor()
        try:
            cursor.execute(sql, self._paramvalues(props))
            db.commit()
            return 1
        except Exception as e:
            db.rollback()
            print('update Error')
            print(repr(e))
            return -1
        finally:
            cursor.close()

    def insertSql(self, sql):
        """Run a ready-made insert statement, then close the connection.

        Returns 1 when the statement reported affected rows, 0 when it did
        not, and -1 when it failed.
        """
        db = self.getconn()
        cursor = db.cursor()
        check = 0
        try:
            if cursor.execute(sql):
                print('Insert Successful')
                db.commit()
                check = 1
        except Exception as e:
            db.rollback()
            check = -1
            print('insertData Failed')
            print(repr(e))
        finally:
            self._closeconn()
        return check

    def _insertrow(self, tablename, props, echo_on_error=False):
        """Insert one row built from props; shared by insert and myinsert."""
        if not props:
            print('insertData Failed')
            return -1
        columns = ",".join(props)
        placeholders = ",".join(["%s"] * len(props))
        sql = ('insert into ' + tablename + '(' + columns + ')'
               + ' values (' + placeholders + ')')
        db = self.getconn()
        cursor = db.cursor()
        print(sql)
        check = 0
        try:
            if cursor.execute(sql, self._paramvalues(props)):
                db.commit()
                check = 1
        except Exception as e:
            db.rollback()
            check = -1
            if echo_on_error:
                print(sql)
            print('insertData Failed')
            print(repr(e))
        finally:
            cursor.close()
        return check

    def insert(self, tablename, **props):
        """Insert one row into tablename.

        props maps column names to values.  Values are sent as query
        parameters (None and 'None' become '').

        Returns 1 on success, 0 when no row was reported inserted, and -1
        on failure or when props is empty.
        """
        return self._insertrow(tablename, props)

    def findAll(self, tablename, startindex, endindex, column, where):
        """Select rows from tablename with a LIMIT clause.

        column is the select list ('' selects every column).  where is
        appended verbatim.  startindex and endindex are emitted as
        "limit startindex,endindex".

        Returns the rows as a list, or [] when the query fails.
        """
        selected = column if column else "*"
        sql = ("select " + selected + " from " + tablename + where
               + " limit " + str(startindex) + "," + str(endindex))
        print(sql)
        return self.find(sql)

    def findBetweenAndId(self, tablename, columns, startid, endid):
        """Select rows whose id lies between startid and endid.

        columns is the select list ('' selects every column).
        """
        selected = columns if columns else "*"
        sql = ("select " + selected + " from " + tablename
               + " t WHERE  t.id  BETWEEN " + str(startid)
               + " AND " + str(endid))
        return self.find(sql)

    def findCount(self, tablename):
        """Return the row count of tablename.

        Raises IndexError when the query fails, because find() then
        returns an empty list.
        """
        sql = "select count(0) from " + tablename + " t "
        return self.find(sql)[0][0]

    def findCountBySql(self, sql):
        """Return the first column of the first row produced by sql.

        Raises IndexError when the query fails or yields no row.
        """
        return self.find(sql)[0][0]

    def findMaxId(self, tablename):
        """Return the first result row of "select max(t.id)" for tablename.

        The whole row is returned, not the bare value.  Raises IndexError
        when the query fails.
        """
        sql = "select max(t.id) from " + tablename + " t "
        return self.find(sql)[0]

    def findOneById(self, tablename, id):
        """Return the rows of tablename whose id equals id.

        The id is sent as a query parameter, so it may be a string or a
        number.
        """
        sql = "select * from " + tablename + " where id=%s"
        return self.find(sql, (id,))

    def find(self, sql, args=None):
        """Run a query and return its rows as a list.

        args, when given, are passed to the driver as query parameters.
        Returns [] when the query fails.
        """
        rows = []
        cursor = self.getconn().cursor()
        try:
            self._execute(cursor, sql, args)
            rows.extend(cursor.fetchall())
        except Exception as e:
            print('find Error')
            print(repr(e))
        finally:
            cursor.close()
        return rows

    def findResults(self, sql, args=None):
        """Run a query and return exactly what the driver's fetchall() gives.

        args, when given, are passed to the driver as query parameters.
        Returns [] when the query fails.
        """
        cursor = self.getconn().cursor()
        try:
            self._execute(cursor, sql, args)
            return cursor.fetchall()
        except Exception as e:
            print('find Error')
            print(repr(e))
            return []
        finally:
            cursor.close()

    def strFilter(self, value):
        """Return '' for None or the string 'None', otherwise value itself."""
        if value is None or value == 'None':
            return ''
        return value

    def myinsert(self, tablename, **props):
        """Insert one row like insert(), echoing the statement on failure.

        props maps column names to values.  Returns 1 on success, 0 when no
        row was reported inserted, and -1 on failure or when props is empty.
        """
        return self._insertrow(tablename, props, echo_on_error=True)
    
  
        
        
    
#bataBaseOp = DataBaseOp()
        
''' 
bataBaseOp.insert('t_comp_patent', patentName='aa',
  applicationNum='ab',
  apilData='ac',
  publicNum='ad',
  publicDate='ae',
  applicant='af',
  Inventor='ag',
  patentType='ah',
  classNum='ai',
  patentAgency='aj',
  agent='ak',
  detailedInfo='al',
  imgurl='am',
  compId='an')
'''
  

#list=bataBaseOp.findOneById('t_bk_comp', '10')
#print(list)
  
        
        
        
        
        
        
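把上面的代码保存为 DataBaseOp.py,放到一个包下(下面的爬虫脚本用 `from database import DataBaseOp` 导入,所以包名要叫 database,或者相应修改导入语句),然后把 getconn() 读取的数据库连接配置(DataConfig 里的 dbhost、dbport、dbuser、dbpassword、dbname)改成你自己的。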

'''
<大吉大利,没有bug>
'''
# encoding=utf-8

from selenium import webdriver
import pandas as pd
import json
import time
import datetime
import re

from database import DataBaseOp as dbop

from pyquery import PyQuery as pq
from selenium.webdriver.common.by import By


# 历史学系
class Crawling(object):
    """Crawls the teacher list pages on phi.ruc.edu.cn into yxj_t_expert.

    readPage() first collects (href, name, title) from every list page,
    then opens each teacher page that is not yet stored and inserts one row
    per teacher through DataBaseOp.insert().
    """

    def __init__(self):
        # Talent programme page; no method visible in this class reads it.
        self.url = 'http://www.sard.ruc.edu.cn/szll/zzjs/qzjs/index.htm'

    def getDriver(self, url):
        """Start a Chrome driver configured for scraping and load url.

        The driver is returned even when loading the page fails, in which
        case 'url error' is printed.  The caller owns the driver and must
        quit it.
        """
        options = webdriver.ChromeOptions()
        # Do not download images.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        # options.add_argument("--no-sandbox") # linux only
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browserClientA"}})
        # Make navigator.webdriver read as undefined on every new document.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                   Object.defineProperty(navigator, 'webdriver', {
                       get: () => undefined
                   })
               """
        })
        try:
            driver.get(url)
            driver.implicitly_wait(25)
            driver.maximize_window()
        except Exception:
            print('url error')

        return driver

    # Image handling (not implemented).
    def getImg(self):
        pass

    def _collectTeachers(self, listurl):
        """Return [href, name, title] for every teacher card on the list pages."""
        teachers = []
        for index, pageurl in enumerate(listurl):
            print('第' + str(index) + '页')
            driver = self.getDriver(pageurl)
            try:
                time.sleep(2)
                for row in driver.find_elements(By.CLASS_NAME, 'row'):
                    for anchor in row.find_elements(By.TAG_NAME, "a"):
                        try:
                            entry = [
                                anchor.get_attribute("href"),
                                anchor.find_element(By.TAG_NAME, "h5").text,
                                anchor.find_element(By.TAG_NAME, "p").text,
                            ]
                        except Exception:
                            # A link without <h5>/<p> is not a teacher card;
                            # skip it instead of aborting the whole crawl.
                            continue
                        if not entry[0]:
                            continue
                        print(entry)
                        teachers.append(entry)
            finally:
                # quit() ends the browser process; one is started per page.
                driver.quit()
        return teachers

    def _scrapeTeacher(self, driver, job_title):
        """Read the detail fields from a loaded teacher page.

        Every field is best effort: a failed lookup prints an error marker
        and leaves the field at its default.  job_title is the title found
        on the list page and is kept unless the page provides its own.
        """
        fields = {'img': '', 'job_title': job_title, 'email': '', 'tel': '',
                  'introduction': '', 'html': ''}

        try:
            fields['img'] = driver.find_element(
                By.XPATH,
                '/html/body/div[6]/div/div[2]/div[2]/div[1]/div[1]').get_attribute('style')
        except Exception:
            print('error img')

        # Title, e-mail and phone all live in spans of the same container,
        # so it is read once.
        try:
            box = driver.find_element(By.CLASS_NAME, "teacherDisplayText")
            texts = [span.text for span in box.find_elements(By.TAG_NAME, "span")]
        except Exception:
            texts = []
            print('error job_title')
            print('error email')
            print('error tel')
        for text in texts:
            if text.startswith('职称'):
                fields['job_title'] = text
            elif text.startswith('email'):
                fields['email'] = text
            elif text.startswith('联系电话'):
                fields['tel'] = text

        try:
            fields['introduction'] = driver.find_element(
                By.XPATH,
                '/html/body/div[6]/div/div[2]/div[2]/div[3]/div[1]/p[1]').text
        except Exception:
            print('error introduction')

        try:
            # Single quotes are replaced with double quotes before the page
            # source is stored.
            fields['html'] = driver.page_source.replace("'", '"')
        except Exception:
            print('error html')

        return fields

    def readPage(self):
        """Crawl every list page and insert each teacher not stored yet."""
        listurl = [
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index1.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index2.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index3.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index4.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index.htm',
        ]

        teachers = self._collectTeachers(listurl)
        print(teachers)

        university = '中国人民大学'
        dept = '哲学院'

        for href, name, list_job_title in teachers:
            # NOTE(review): href comes from the scraped page and is spliced
            # into the SQL text; findCountBySql only accepts a SQL string,
            # so a URL containing a quote will break this query.
            sql = "select count(t.id) from yxj_t_expert t where t.pageurl='" + href + "' "
            if dbop.DataBaseOp().findCountBySql(sql) > 0:
                continue

            driver = self.getDriver(href)
            try:
                time.sleep(2)
                fields = self._scrapeTeacher(driver, list_job_title)
                dbop.DataBaseOp().insert('yxj_t_expert', university=university,
                                         headtag='',
                                         dept=dept,
                                         name=name,
                                         tel=fields['tel'],
                                         level='',
                                         sex='',
                                         introduction=fields['introduction'],
                                         job_title=fields['job_title'],
                                         job='',
                                         birthday='',
                                         img=fields['img'],
                                         degree='',
                                         email=fields['email'],
                                         pageurl=href,
                                         research_direction='',
                                         html=fields['html'],
                                         education='',
                                         research_project='',
                                         paper='',
                                         honors_awards='',
                                         awards='',
                                         research_findings='',
                                         work_experience='',
                                         subject='',
                                         main_part_job='',
                                         main_works='',
                                         createtime=datetime.datetime.now())
            finally:
                # Release the browser even when scraping or the insert fails.
                driver.quit()


# Start the crawl only when this file is run as a script, so that importing
# the module does not launch browsers or touch the database.
if __name__ == "__main__":
    c = Crawling()
    c.readPage()

insert() 的关键字参数就是数据表的列名:你的表里有哪些字段,就传哪些字段,按需增删即可。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值