'''
Created on 2018年8月3日
@author: jied
'''
import pymysql
import traceback
import time
from . import DataConfig as dc
#数据库操作类
class DataBaseOp(object):
    """Small CRUD helper built on one shared pymysql connection.

    The connection lives on the class (``DataBaseOp.db``). It is opened lazily
    by ``getconn()`` from the settings in ``DataConfig`` and reused by every
    instance. ``delete()`` and ``insertSql()`` close it when they finish; the
    slot is then cleared so that the next call reconnects instead of reusing a
    closed connection.

    Values given to ``insert()``, ``myinsert()``, ``update()`` and
    ``findOneById()`` are sent as bound parameters. Table names, column names
    and ``where`` fragments are still spliced into the statement text, so they
    must never come from untrusted input.
    """

    # Shared connection; None until getconn() opens it (or after it was closed).
    db = None

    def __init__(self):
        # Nothing is stored per instance: all settings are read from DataConfig.
        pass

    def getconn(self):
        """Return the shared connection, opening it on first use."""
        if DataBaseOp.db is None:
            DataBaseOp.db = pymysql.connect(host=dc.dbhost, user=dc.dbuser,
                                            password=dc.dbpassword,
                                            port=dc.dbport, db=dc.dbname)
        return DataBaseOp.db

    @staticmethod
    def _close():
        """Close the shared connection and clear the slot so getconn() reconnects."""
        conn = DataBaseOp.db
        DataBaseOp.db = None
        if conn is not None:
            conn.close()

    @staticmethod
    def _literal(fragment):
        """Double every ``%`` so *fragment* survives the driver's ``query % args`` step."""
        return fragment.replace('%', '%%')

    @staticmethod
    def _execute(cursor, sql, params=None):
        """Run *sql*, passing *params* only when the caller supplied some."""
        if params is None:
            return cursor.execute(sql)
        return cursor.execute(sql, params)

    def _bound_values(self, props):
        """Return the column values as strings, with None / 'None' mapped to ''."""
        return tuple(str(self.strFilter(value)) for value in props.values())

    def _build_insert(self, tablename, props):
        """Return ``(sql, params)`` for inserting *props* into *tablename*."""
        columns = ",".join(self._literal(name) for name in props)
        marks = ",".join(["%s"] * len(props))
        sql = ("insert into " + self._literal(tablename)
               + "(" + columns + ") values (" + marks + ")")
        return sql, self._bound_values(props)

    def _build_update(self, tablename, where, props):
        """Return ``(sql, params)`` for updating *props* on rows matching *where*."""
        assignments = ",".join(self._literal(name) + "=%s" for name in props)
        sql = ("update " + self._literal(tablename) + " set "
               + assignments + self._literal(where))
        return sql, self._bound_values(props)

    def _insert_row(self, tablename, props, echo_sql_on_failure):
        """Shared body of insert() and myinsert().

        Returns 1 when the statement affected at least one row (and was
        committed), 0 when it affected none, -1 on error or when no columns
        were given.
        """
        if not props:
            # "insert into t() values ()" is valid MySQL and would add an
            # all-defaults row; treat a call without columns as a failure.
            print('insertData Failed')
            return -1
        sql, params = self._build_insert(tablename, props)
        db = self.getconn()
        cursor = db.cursor()
        print(sql)
        check = 0
        try:
            if cursor.execute(sql, params):
                db.commit()
                check = 1
        except Exception:
            db.rollback()
            check = -1
            if echo_sql_on_failure:
                print(sql)
            print('insertData Failed')
            traceback.print_exc()
        finally:
            cursor.close()
        return check

    def delete(self, tablename, where):
        """Delete the rows of *tablename* selected by the raw *where* fragment.

        *where* is appended verbatim (include the leading " where ..."). The
        shared connection is closed afterwards in every case.

        Returns 1 on success.
        Raises SyntaxError('delete error') when the statement fails; the
        original database error is attached as the cause.
        """
        sql = "delete from " + tablename + where
        print(sql)
        db = self.getconn()
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            db.commit()
        except Exception as e:
            db.rollback()
            print('delete Error')
            raise SyntaxError('delete error') from e
        finally:
            cursor.close()
            self._close()
        print('delete success')
        return 1

    def update(self, tablename, where, **props):
        """Set the columns in *props* on the rows selected by the raw *where* fragment.

        Values are bound as parameters after being stringified (None and the
        string 'None' become ''). Returns 1 on success, -1 on error or when no
        columns were given.
        """
        if not props:
            print('update Error')
            return -1
        sql, params = self._build_update(tablename, where, props)
        db = self.getconn()
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            db.commit()
            return 1
        except Exception:
            db.rollback()
            print('update Error')
            traceback.print_exc()
            return -1
        finally:
            cursor.close()

    def insertSql(self, sql):
        """Execute a caller-written insert statement, then close the connection.

        Returns 1 when at least one row was affected (and committed), 0 when
        none was, -1 on error.
        """
        db = self.getconn()
        cursor = db.cursor()
        check = 0
        try:
            if cursor.execute(sql):
                print('Insert Successful')
                db.commit()
                check = 1
        except Exception:
            db.rollback()
            check = -1
            print('insertData Failed')
            traceback.print_exc()
        finally:
            cursor.close()
            self._close()
        return check

    def insert(self, tablename, **props):
        """Insert one row.

        tablename: name of the table.
        props: column name -> value; values are bound as parameters.

        Returns 1 on success, 0 if no row was affected, -1 on error.
        """
        return self._insert_row(tablename, props, False)

    def findAll(self, tablename, startindex, endindex, column, where):
        """Select *column* (or ``*`` when empty) with ``limit startindex,endindex``.

        MySQL reads the second limit argument as a row count, not as an end
        index. *where* is a raw fragment appended after the table name.
        """
        selected = column if column else "*"
        sql = ("select " + selected + " from " + tablename + where
               + " limit " + str(startindex) + "," + str(endindex))
        print(sql)
        return self.find(sql)

    def findBetweenAndId(self, tablename, columns, startid, endid):
        """Select *columns* (or ``*`` when empty) for rows with id in [startid, endid]."""
        selected = columns if columns else "*"
        sql = ("select " + selected + " from " + tablename
               + " t WHERE t.id BETWEEN " + str(startid) + " AND " + str(endid))
        return self.find(sql)

    def findCount(self, tablename):
        """Return the row count of *tablename*.

        Raises IndexError if the query failed, because find() then returns [].
        """
        sql = "select count(0) from " + tablename + " t "
        return self.find(sql)[0][0]

    def findCountBySql(self, sql, params=None):
        """Return the first column of the first row produced by *sql*.

        Raises IndexError if the query failed or produced no rows.
        """
        return self.find(sql, params)[0][0]

    def findMaxId(self, tablename):
        """Return the first result row of ``select max(id)``: a one-element row, not a scalar."""
        sql = "select max(t.id) from " + tablename + " t "
        return self.find(sql)[0]

    def findOneById(self, tablename, id):
        """Return the rows of *tablename* whose id equals *id* (str or int)."""
        sql = "select * from " + self._literal(tablename) + " where id=%s"
        return self.find(sql, (str(id),))

    def find(self, sql, params=None):
        """Run a query and return its rows as a list; [] when the query fails.

        params: optional values for ``%s`` placeholders in *sql*. When given,
        any literal ``%`` in *sql* must be written as ``%%``.
        """
        rows = []
        cursor = self.getconn().cursor()
        try:
            self._execute(cursor, sql, params)
            rows.extend(cursor.fetchall())
        except Exception:
            print('find Error')
            traceback.print_exc()
        finally:
            cursor.close()
        return rows

    def findResults(self, sql, params=None):
        """Run a query and return ``fetchall()`` unchanged; an empty tuple on failure."""
        cursor = self.getconn().cursor()
        try:
            self._execute(cursor, sql, params)
            return cursor.fetchall()
        except Exception:
            print('find Error')
            traceback.print_exc()
            return ()
        finally:
            cursor.close()

    def strFilter(self, value):
        """Map None and the string 'None' to ''; return anything else unchanged."""
        if value is None or value == 'None':
            return ''
        return value

    def myinsert(self, tablename, **props):
        """Insert one row like insert(), additionally echoing the statement on failure.

        tablename: name of the table.
        props: column name -> value; values are bound as parameters.
        """
        return self._insert_row(tablename, props, True)
#bataBaseOp = DataBaseOp()
'''
bataBaseOp.insert('t_comp_patent', patentName='aa',
applicationNum='ab',
apilData='ac',
publicNum='ad',
publicDate='ae',
applicant='af',
Inventor='ag',
patentType='ah',
classNum='ai',
patentAgency='aj',
agent='ak',
detailedInfo='al',
imgurl='am',
compId='an')
'''
#list=bataBaseOp.findOneById('t_bk_comp', '10')
#print(list)
将 DataBaseOp.py 放在任意一个包下,然后按你自己的数据库配置修改 getconn() 中的连接参数。
'''
<大吉大利,没有bug>
'''
# encoding=utf-8
from selenium import webdriver
import pandas as pd
import json
import time
import datetime
import re
from database import DataBaseOp as dbop
from pyquery import PyQuery as pq
from selenium.webdriver.common.by import By
# Faculty crawler for the School of Philosophy (哲学院), Renmin University of China
class Crawling(object):
    """Selenium crawler that copies faculty profiles into table ``yxj_t_expert``.

    ``readPage()`` walks the listing pages under
    ``phi.ruc.edu.cn/jytd/jszl/azyll/qb/``, opens every profile that is not
    stored yet and inserts one row per profile through ``DataBaseOp``.
    """

    def __init__(self):
        # Talent-programme page; no method in this class reads this attribute.
        self.url = 'http://www.sard.ruc.edu.cn/szll/zzjs/qzjs/index.htm'

    def getDriver(self, url):
        """Start a Chrome session configured for crawling and load *url*.

        Images are disabled and the ``navigator.webdriver`` flag is hidden.
        A failure while loading the page is reported as 'url error' and the
        driver is returned anyway. The caller owns the driver and must quit it.
        """
        options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}  # do not load images
        options.add_experimental_option("prefs", prefs)
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        # options.add_argument("--no-sandbox")  # linux only
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browserClientA"}})
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })
        try:
            driver.get(url)
            driver.implicitly_wait(25)
            driver.maximize_window()
        except Exception:
            # Exception rather than BaseException so that Ctrl-C still stops the crawl.
            print('url error')
        return driver

    def getImg(self):
        """Placeholder for image handling; does nothing."""
        pass

    def _collect_entries(self, listurl):
        """Return ``[href, name, job_title]`` for every profile link on the listing pages.

        Every ``<a>`` inside an element with class ``row`` is expected to
        contain an ``<h5>`` (name) and a ``<p>`` (job title).
        """
        entries = []
        for index, url in enumerate(listurl):
            print('第' + str(index) + '页')
            driver = self.getDriver(url)
            try:
                time.sleep(2)
                for row in driver.find_elements(By.CLASS_NAME, 'row'):
                    for link in row.find_elements(By.TAG_NAME, "a"):
                        entry = [
                            link.get_attribute("href"),
                            link.find_element(By.TAG_NAME, "h5").text,
                            link.find_element(By.TAG_NAME, "p").text,
                        ]
                        print(entry)
                        entries.append(entry)
            finally:
                # One browser is started per listing page; quit it so the
                # Chrome/chromedriver processes do not pile up.
                driver.quit()
        return entries

    def _already_stored(self, href):
        """Return True when a row with this page URL exists in ``yxj_t_expert``.

        The URL comes from a scraped page, so it is bound as a parameter
        instead of being concatenated into the statement.
        """
        cursor = dbop.DataBaseOp().getconn().cursor()
        try:
            cursor.execute(
                "select count(t.id) from yxj_t_expert t where t.pageurl=%s",
                (href,))
            return cursor.fetchone()[0] > 0
        finally:
            cursor.close()

    def _labelled_span(self, driver, prefix, default, error_label, pause):
        """Return the text of the last ``teacherDisplayText`` span starting with *prefix*.

        Falls back to *default* when no span matches; prints *error_label*
        when the lookup itself fails. *pause* is the number of seconds to
        sleep before looking.
        """
        value = default
        try:
            if pause:
                time.sleep(pause)
            block = driver.find_element(By.CLASS_NAME, "teacherDisplayText")
            for span in block.find_elements(By.TAG_NAME, "span"):
                if span.text.startswith(prefix):
                    value = span.text
        except Exception:
            print(error_label)
        return value

    def _scrape_detail(self, driver, listed_title):
        """Read the fields of one loaded profile page into a dict.

        Each field is read independently: a failure prints a short marker and
        leaves that field at its default.
        """
        img = ''
        try:
            time.sleep(2)
            img = driver.find_element(By.XPATH, '/html/body/div[6]/div/div[2]/div[2]/div[1]/div[1]').get_attribute('style')
        except Exception:
            print('error img')
        job_title = self._labelled_span(driver, '职称', listed_title, 'error job_title', 2)
        email = self._labelled_span(driver, 'email', '', 'error email', 2)
        tel = self._labelled_span(driver, '联系电话', '', 'error tel', 0)
        introduction = ''
        try:
            time.sleep(2)
            introduction = driver.find_element(By.XPATH,
                                               '/html/body/div[6]/div/div[2]/div[2]/div[3]/div[1]/p[1]').text
        except Exception:
            print('error introduction')
        html = ''
        try:
            html = driver.page_source
            # Single quotes are replaced before the page source is stored.
            html = html.replace("'", '"')
        except Exception:
            print('error html')
        return {
            'img': img,
            'job_title': job_title,
            'email': email,
            'tel': tel,
            'introduction': introduction,
            'html': html,
        }

    def readPage(self):
        """Crawl all listing pages and store every profile that is not stored yet."""
        listurl = [
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index1.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index2.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index3.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index4.htm',
            'http://phi.ruc.edu.cn/jytd/jszl/azyll/qb/index.htm',
        ]
        entries = self._collect_entries(listurl)
        print(entries)
        university = '中国人民大学'
        dept = '哲学院'
        for href, name, listed_title in entries:
            if self._already_stored(href):
                continue
            driver = self.getDriver(href)
            try:
                detail = self._scrape_detail(driver, listed_title)
            finally:
                # quit() ends the whole session even if scraping raised.
                driver.quit()
            dbop.DataBaseOp().insert('yxj_t_expert', university=university,
                                     headtag='',
                                     dept=dept,
                                     name=name,
                                     tel=detail['tel'],
                                     level='',
                                     sex='',
                                     introduction=detail['introduction'],
                                     job_title=detail['job_title'],
                                     job='',
                                     birthday='',
                                     img=detail['img'],
                                     degree='',
                                     email=detail['email'],
                                     pageurl=href,
                                     research_direction='',
                                     html=detail['html'],
                                     education='',
                                     research_project='',
                                     paper='',
                                     honors_awards='',
                                     awards='',
                                     research_findings='',
                                     work_experience='',
                                     subject='',
                                     main_part_job='',
                                     main_works='',
                                     createtime=datetime.datetime.now())
# Script entry: these statements run when the module is executed or imported.
# They build the crawler and immediately start the full crawl.
c = Crawling()
c.readPage()
这里的字段请按实际需要自行增减:表里有什么字段,就加什么字段。