使用 Selenium 爬取腾讯招聘信息,并保存为 Excel 文件
- 代码比较简单,直接上源码
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import xlwt
class Tencent(object):
    """Scrape job postings from the Tencent careers search page into an .xls file.

    Creating an instance starts a Chrome WebDriver and immediately runs the
    whole pipeline via ``main()``: fetch each result page, extract the
    postings, write them to ``tencent.xls`` and shut the browser down.
    """

    # Result pages fetched by main(): 1 .. PAGE_COUNT inclusive.
    PAGE_COUNT = 10
    # Maximum number of seconds to wait for the job list to be rendered.
    WAIT_SECONDS = 20

    def __init__(self, url):
        """Store the URL template, start Chrome and run the scrape.

        Args:
            url: URL template containing one ``%``-style placeholder that
                ``main()`` fills with the page number.
        """
        self.url = url
        self.driver = webdriver.Chrome()
        self.data_list = []
        self.main()

    def get_content_by_selenium(self, url):
        """Load ``url`` in the browser and return the rendered page source.

        Blocks (explicit wait, up to ``WAIT_SECONDS``) until at least one
        ``div[@class="correlation-degree"]`` element is present in the DOM.
        If the wait expires, the exception raised by ``WebDriverWait.until``
        propagates to the caller.
        """
        self.driver.get(url)
        wait = WebDriverWait(self.driver, self.WAIT_SECONDS)
        wait.until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//div[@class="correlation-degree"]')
            )
        )
        return self.driver.page_source

    def parse_div(self, div_list):
        """Extract one record per posting element and append it to ``self.data_list``.

        Args:
            div_list: iterable of elements exposing ``.xpath()``; each is
                queried with XPath expressions relative to itself.

        An element that lacks any of the expected nodes is skipped as a whole;
        no partial record is stored for it.
        """
        for div in div_list:
            try:
                job_title = div.xpath('.//h4/text()')[0]
                job_address = div.xpath('.//a/p/span[2]/text()')[0]
                job_type = div.xpath('.//a/p/span[3]/text()')[0]
                job_time = div.xpath('.//a/p/span[4]/text()')[0]
                job_detail = div.xpath('.//a/p[2]/text()')[0].replace('\n', '')
            except IndexError:
                # An XPath matched nothing, so this element is not a complete
                # posting. Skip it instead of hiding every possible error.
                continue

            data = {
                '岗位名称': job_title,
                '工作地点': job_address,
                '工作类型': job_type,
                '发布时间': job_time,
                '职位描述': job_detail,
            }
            print(data)
            self.data_list.append(data)

    def write_excel(self, filename, sheetname, data_list):
        """Write ``data_list`` to a single-sheet workbook saved as ``filename``.

        Args:
            filename: path of the workbook to save.
            sheetname: name of the sheet that receives the data.
            data_list: list of dicts; the keys of the first dict become the
                header row and every dict must contain those keys.

        When ``data_list`` is empty nothing is written and no file is created.
        """
        if not data_list:
            # There is no first record to take the header from.
            print('没有数据,未写入文件')
            return

        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet(sheetname)

        # Header row: column titles in the key order of the first record.
        head = list(data_list[0])
        for col, title in enumerate(head):
            sheet.write(0, col, title)

        # Data rows start at row 1, directly below the header.
        for row, item in enumerate(data_list, start=1):
            for col, title in enumerate(head):
                sheet.write(row, col, item[title])

        workbook.save(filename)
        print('写入成功')

    def main(self):
        """Scrape every result page, export the records, then quit the browser."""
        try:
            for page in range(1, self.PAGE_COUNT + 1):
                html_str = self.get_content_by_selenium(self.url % page)
                html = etree.HTML(html_str)
                div_list = html.xpath(
                    '//div[@class="recruit-wrap recruit-margin"]/div'
                )
                self.parse_div(div_list)
            self.write_excel('tencent.xls', 'job', self.data_list)
        finally:
            # Always end the WebDriver session, even when a page load, wait or
            # save fails; otherwise the Chrome process is left running.
            self.driver.quit()
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # The %s placeholder is filled with the page number for each request.
    base_url = 'https://careers.tencent.com/search.html?index=%s'
    # Instantiating Tencent starts the scrape (its __init__ calls main()).
    Tencent(base_url)