import time
from urllib.parse import urljoin

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class Boss_spydier(object):
    """Crawl a zhipin.com job search result page with a Chrome WebDriver.

    ``run`` loads the search page at ``base_url``, opens every job link found
    on it in a separate browser window, extracts a few fields from each detail
    page and appends them as dicts to ``self.positions``.
    """

    # Root used to turn the hrefs found on the list page into absolute URLs.
    SITE_ROOT = 'https://www.zhipin.com/'

    def __init__(self):
        """Start a Chrome WebDriver session and initialise the result list."""
        self.driver = webdriver.Chrome()
        self.base_url = "https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position="
        self.positions = []

    def run(self):
        """Load the search page and process every job linked from it."""
        self.driver.get(self.base_url)
        source = self.driver.page_source
        self.parse_list_page(source)

    @classmethod
    def _absolute_url(cls, href):
        """Return ``href`` resolved against ``SITE_ROOT``.

        ``urljoin`` avoids the double slash that plain concatenation produces
        for root-relative hrefs and leaves already-absolute URLs untouched.
        """
        return urljoin(cls.SITE_ROOT, href)

    @staticmethod
    def _text_at(nodes, index=0, default=""):
        """Return ``nodes[index]`` stripped, or ``default`` if it is missing.

        :param nodes: sequence of strings (e.g. an XPath ``text()`` result).
        :param index: position of the wanted entry.
        :param default: value returned when ``nodes`` has no such entry.
        """
        try:
            return nodes[index].strip()
        except IndexError:
            return default

    def parse_list_page(self, source):
        """Extract the job links from the list page HTML and visit each one.

        :param source: HTML text of the search result page.
        """
        html = etree.HTML(source)
        links = html.xpath(".//div[@class='info-primary']/h3[@class='name']/a/@href")
        for link in links:
            self.request_detail_page(self._absolute_url(link))
            # Pause between detail pages to throttle the crawl.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open ``url`` in a new window, parse it, then return to the list.

        The new window is always closed and focus is always returned to the
        window that was active on entry, even if waiting for or parsing the
        page raises; the exception itself still propagates to the caller.

        :param url: absolute URL of the job detail page.
        """
        list_window = self.driver.current_window_handle
        known_windows = set(self.driver.window_handles)
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        opened = [
            handle for handle in self.driver.window_handles
            if handle not in known_windows
        ]
        # Raises IndexError when no new window appeared, as the original
        # positional lookup did.
        self.driver.switch_to.window(opened[0])
        try:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='main']"))
            )
            self.parse_detail_page(self.driver.page_source)
            time.sleep(2)
        finally:
            self.driver.close()
            self.driver.switch_to.window(list_window)

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page and record them.

        The resulting dict is appended to ``self.positions`` and printed.
        A field whose node is absent from the page is stored as ``""``
        instead of aborting the crawl with ``IndexError``.

        :param source: HTML text of the job detail page.
        """
        html = etree.HTML(source)
        # City, work experience and education are read from the first three
        # text nodes of the same paragraph, so evaluate that XPath only once.
        primary_info = html.xpath(".//div[@class='info-primary']/p/text()")
        # TODO: parse the job description / requirements / team introduction
        # sections from the "div[@class='text']" text nodes (not implemented).
        position = {
            'job_name': self._text_at(html.xpath(".//div[@class='name']/h1/text()")),
            'salary': self._text_at(html.xpath(".//span[@class='badge']/text()")),
            'city': self._text_at(primary_info, 0),
            'work_year': self._text_at(primary_info, 1),
            'education': self._text_at(primary_info, 2),
            'company': self._text_at(
                html.xpath(".//div[@class='info-company']/h3[@class='name']/a/text()")
            ),
        }
        self.positions.append(position)
        print(position)
        print("#" * 40)
if __name__ == '__main__':
    # Script entry point: crawl once, then always shut the browser down so no
    # Chrome / chromedriver process is left behind, even if the crawl fails.
    boss_spider = Boss_spydier()
    try:
        boss_spider.run()
    finally:
        boss_spider.driver.quit()
# TODO: still to be completed.