import requests
from bs4 import BeautifulSoup
import datetime
import re
import pymysql
import datetime
#数据库封装
# Thin wrapper around a pymysql connection: connect once, run
# parameterized statements with commit/rollback handling.
class Mydb():
    def __init__(self):
        """Open a connection to the local `py11` database and create a cursor."""
        try:
            # pymysql 1.0+ removed positional connect() arguments, so pass
            # everything by keyword (also far more readable).
            self.conn = pymysql.connect(host='127.0.0.1',
                                        user='root',
                                        password='123456',
                                        database='py11',
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception as e:
            print(e)

    def execute(self, sql, data):
        """Run one parameterized statement and commit it.

        :param sql: SQL string with %s placeholders
        :param data: sequence of values bound to the placeholders
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception as e:
            # Undo the partial transaction rather than leaving it open.
            self.conn.rollback()
            print(e)

    def close(self):
        """Release the cursor and connection; safe to call even if connect failed."""
        try:
            self.cursor.close()
            self.conn.close()
        except Exception as e:
            print(e)
# Listing-page URL template; %d is the 0-based record offset ("start" query arg).
base_url = 'http://hr.tencent.com/position.php?start=%d'
# Impersonate a desktop Chrome browser so the site serves the normal HTML pages.
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# Scrape one job-detail page and upsert the extracted fields into `ceshi`.
def parse_detail(url):
    """Fetch a position detail page, parse its fields and store them.

    :param url: absolute detail-page URL (expected to carry an ``id=`` query arg)
    """
    # timeout keeps a stalled server from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = BeautifulSoup(response.text, 'lxml')
    # Position title: the single header row (class "h") of the detail table.
    position_name = html.select('tr[class="h"]')[0].text.strip()
    info = html.select('table.tablelist tr')
    # Second row holds location / category / head-count cells.
    location = info[1].select('td')[0].contents[-1]
    p_type = info[1].select('td')[1].contents[-1]
    # Head count is rendered like "2人"; strip the counter word.
    p_number = info[1].select('td')[2].contents[-1].strip('人')
    # Third row: job duties, one <li> per bullet point.
    duty = ''.join(li.text for li in info[2].select('li'))
    # Fourth row: job requirements.
    requirement = ''.join(li.text for li in info[3].select('li'))
    # Crawl timestamp formatted for the DATETIME column.
    crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The numeric id in the final URL uniquely identifies the position.
    res = re.search(r'id=(\d+)', response.url)
    if res is None:
        # Without an id we cannot build the row's key; skip instead of crashing.
        print('no id found in url: %s' % response.url)
        return
    url_id = res.group(1)
    # Upsert keyed on url_id so repeated crawls refresh rather than duplicate.
    sql = 'insert into ceshi(url_id,position_name,location,p_type,p_number,duty,requirement,crawl_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) ' \
          'on duplicate key update position_name=values(position_name)'
    data = [url_id, position_name, location, p_type, p_number, duty, requirement, crawl_time]
    print(position_name)
    mydb.execute(sql, data)
def getPage(max_start=2920, step=10):
    """Walk the paginated listing and crawl every linked detail page.

    :param max_start: largest ``start`` offset to request (inclusive);
        defaults preserve the original hard-coded crawl range.
    :param step: number of records per listing page
    """
    for start in range(0, max_start + 1, step):
        fullurl = base_url % start
        # timeout keeps a stalled server from hanging the whole crawl.
        response = requests.get(fullurl, headers=headers, timeout=10)
        html = BeautifulSoup(response.text, 'lxml')
        # First row is the table header, last row is the pager: skip both.
        tr_list = html.select('table.tablelist tr')[1:-1]
        for tr in tr_list:
            detail_link = tr.select('td > a')[0].get('href')
            # Hrefs are site-relative (position_detail.php?id=...); make absolute.
            parse_detail('http://hr.tencent.com/' + detail_link)
if __name__ == '__main__':
    # Module-level DB handle: parse_detail() reads this global when saving rows.
    mydb = Mydb()
    # Crawl every listing page and all detail pages reachable from them.
    getPage()
# Reposted from: https://www.cnblogs.com/luwanhe/p/9502826.html