# Import the database connection helper from mysql_link
from lxml import etree
from urllib import parse
import requests
import json
from mysql_link import mysql_connect
def _sql_quote(value):
    """Return *value* as text that can sit inside a double-quoted SQL literal.

    Backslashes and double quotes are backslash-escaped so scraped text
    cannot terminate the literal early and break (or inject into) the
    statement.

    NOTE(review): this assumes the server uses MySQL's default backslash
    escaping (sql_mode without NO_BACKSLASH_ESCAPES) -- confirm. If
    ``mysql_do`` can accept query parameters, prefer those over escaping.
    """
    return str(value).replace('\\', '\\\\').replace('"', '\\"')


def get_detail(detail_url, mysql):
    """Fetch one job-detail page and insert its fields into the ``tencent`` table.

    Args:
        detail_url: URL of the detail page to download.
        mysql: Object exposing ``mysql_do(sql)``; it is called once with the
            finished INSERT statement.

    Side effects:
        Prints the parsed title and place, then the SQL statement, before the
        statement is handed to ``mysql.mysql_do``. If the page does not
        contain the expected title/place/position/person cells, a message is
        printed and nothing is inserted.
    """
    user_agent = 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    headers = {
        'User-Agent': user_agent
    }
    response = requests.get(detail_url, headers=headers)
    html_ele = etree.HTML(response.text)

    # Every field lives in the same table; build the row paths from one prefix.
    table = '//table[@class="tablelist textl"]'
    title = html_ele.xpath(table + '/tr[1]/td')
    place = html_ele.xpath(table + '/tr[2]/td[1]/text()')
    position = html_ele.xpath(table + '/tr[2]/td[2]/text()')
    person = html_ele.xpath(table + '/tr[2]/td[3]/text()')

    # xpath() returns an empty list when nothing matches; indexing it would
    # raise IndexError and abort the whole crawl, so skip this page instead.
    if not (title and place and position and person):
        print('skip {}: detail table is incomplete'.format(detail_url))
        return

    print(title[0].text)
    print(place[0])

    # The duty and requirement rows hold one text node per list item; store
    # each as a single concatenated string.
    str_duty = ''.join(html_ele.xpath(table + '/tr[3]/td/ul/li/text()'))
    str_requirement = ''.join(html_ele.xpath(table + '/tr[4]/td/ul/li/text()'))

    fields = (title[0].text, place[0], position[0], person[0], str_duty, str_requirement)
    # The values come from a remote page, so escape them before they are
    # spliced into the statement text.
    sql = 'insert into tencent (title,place,posi,person,duty,requirement) values("{}","{}","{}","{}","{}","{}")'.format(
        *(_sql_quote(field) for field in fields)
    )
    print(sql)
    mysql.mysql_do(sql)
def getPage():
    """Crawl the first five listing pages and store every linked job detail.

    Opens one database handle with ``mysql_connect()``, requests the listing
    URL with ``start`` offsets 0, 20, 40, 60 and 80, and calls ``get_detail``
    for the link found in the first cell of each table row. Every listing URL
    and detail URL is printed as it is visited.
    """
    my = mysql_connect()
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E8%AF%8D&start={}#a'
    user_agent = 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    headers = {
        'User-Agent': user_agent
    }
    for page in range(0, 5):
        # The ``start`` query parameter advances in steps of 20 per page.
        full_url = url.format(page * 20)
        print(full_url)
        response = requests.get(full_url, headers=headers)
        html_ele = etree.HTML(response.text)
        tr_list = html_ele.xpath('//table[@class="tablelist"]/tr')
        # The first and last rows are dropped -- presumably the table header
        # and the pager row; verify against the live markup.
        for row in tr_list[1:-1]:
            a_list = row.xpath('./td[1]/a/@href')
            if not a_list:
                # A row without a link has no detail page to fetch.
                continue
            # urljoin resolves relative hrefs against the site root and also
            # copes with a leading slash or an already-absolute href.
            detail_url = parse.urljoin('https://hr.tencent.com/', a_list[0])
            print(detail_url)
            get_detail(detail_url, my)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    getPage()