以前的项目,腾讯招聘网的网站是hr.tencent.com…
现在改为 careers.tencent.com,页面内容由 JavaScript 渲染,直接打开网页源代码会出现下面的情况:
所以先用 selenium + ChromeDriver 打开页面,通过 driver.page_source 获取渲染后的源码,再用 lxml 的 XPath 进行解析。
代码段:
from selenium import webdriver
from lxml import etree
import csv
class tencent(object):
    """Scrape one listing page of careers.tencent.com into a CSV file.

    The page is loaded in Chrome through selenium, the resulting HTML is
    parsed with lxml XPath queries, and each job card becomes one CSV row.
    """

    def __init__(self, url):
        """Remember the listing page to scrape.

        Args:
            url: Address of the search-result page to load.
        """
        self.url = url

    def save(self, content):
        """Append one row to ``tencent7.csv`` in the working directory.

        Args:
            content: List of cell values making up a single CSV row.
        """
        # newline='' leaves line-ending handling to the csv module; the
        # explicit encoding keeps Chinese text independent of the platform
        # default codec.
        with open('tencent7.csv', 'a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(content)

    @staticmethod
    def _first_text(node, path):
        """Return the first XPath match of ``path`` under ``node``, or ''."""
        matches = node.xpath(path)
        return matches[0] if matches else ''

    def drivers(self):
        """Load ``self.url`` in Chrome and save every job card found.

        Each row is written as [job name, address, release time, duty].
        Fields missing from a card are written as empty strings; cards with
        no extractable field at all are skipped.
        """
        driver = webdriver.Chrome()
        try:
            # Use the URL given to this instance rather than a module global.
            driver.get(self.url)
            # NOTE(review): the source is read as soon as get() returns; if
            # the listing is filled in asynchronously an explicit wait may be
            # needed here — confirm against the live page.
            response = driver.page_source
        finally:
            # Always close the browser, even when loading the page fails.
            driver.quit()

        items = etree.HTML(response)
        for table in items.xpath('//div[@class="recruit-list"]'):
            jobName = self._first_text(table, './/a/h4/text()')
            jobDuty = self._first_text(table, './/a/p[2]/text()')
            jobAddress = self._first_text(table, './/p/span[2]/text()')
            releaseTime = self._first_text(table, './/p/span[4]/text()')
            content = [jobName, jobAddress, releaseTime, jobDuty]
            if not any(content):
                continue
            self.save(content)
if __name__ == '__main__':
    # Scrape each listing page in turn; range(1, 2) covers page 1 only.
    for page_index in range(1, 2):
        url = 'https://careers.tencent.com/search.html?&index=%s' % page_index
        spider = tencent(url)
        spider.drivers()
        # Visual separator printed after each page.
        print('*' * 50)
目前还不确定能否与 Scrapy 结合,并在 Linux 上运行爬取。