import pymongo
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class LagouSpider(object):
    """Selenium-driven crawler for the lagou.com "python" job listing.

    The spider walks the paginated result list in a Chrome window, opens
    every job posting in a separate browser window, extracts a handful of
    fields from the detail page and stores each posting both in
    ``self.positions`` and in the MongoDB collection ``test.lagou`` on
    ``127.0.0.1:27017``.
    """

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.positions = []
        # MongoDB client; created on the first save and reused afterwards
        # instead of opening a new connection pool for every posting.
        self._client = None

    def connectDB(self, host, port):
        """
        Connect to a MongoDB server.

        :param host: host name or IP address of the server
        :param port: TCP port of the server
        :return: a new ``pymongo.MongoClient``
        """
        client = pymongo.MongoClient(host=host, port=port)
        return client

    def _collection(self):
        """Return the ``test.lagou`` collection, connecting on first use."""
        if getattr(self, "_client", None) is None:
            self._client = self.connectDB('127.0.0.1', 27017)
        return self._client['test']['lagou']

    def _close_db(self):
        """Close the cached MongoDB client, if one was ever opened."""
        client = getattr(self, "_client", None)
        if client is not None:
            client.close()
            self._client = None

    @staticmethod
    def _first(nodes, default=""):
        """Return the first item of an XPath result list, or *default* if empty."""
        return nodes[0] if nodes else default

    @staticmethod
    def _clean_field(text):
        """Remove all whitespace and ``/`` separator characters from *text*."""
        return re.sub(r"[\s/]", "", text)

    def run(self):
        """
        Crawl every page of the result list until the "next" button is disabled.

        The MongoDB client opened while saving postings is closed when the
        crawl ends, whether it finishes normally or raises.
        """
        pager_xpath = "//div[@class='pager_container']/span[last()]"
        self.driver.get(self.url)
        try:
            while True:
                # Wait for the pager to be present BEFORE reading the page
                # source, so the snapshot is taken from a rendered page.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, pager_xpath))
                )
                source = self.driver.page_source
                # Visit every posting linked from this result page.
                self.parse_list_page(source)
                time.sleep(2)
                # The last <span> of the pager is the "next page" button.
                next_btn = self.driver.find_element(By.XPATH, pager_xpath)
                # A disabled "next" button marks the last page.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                next_btn.click()
                time.sleep(1)
        finally:
            self._close_db()

    def parse_list_page(self, source):
        """
        Extract the detail-page links from a result page and visit each one.

        :param source: HTML source of a result-list page
        :return: None
        """
        html = etree.HTML(source)
        # hrefs of all job postings listed on this page
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # pause between detail-page requests
            time.sleep(1)

    def request_detail_page(self, url):
        """
        Open a job posting in a new browser window, parse it and close it.

        The new window is always closed and focus is always returned to the
        list window, even when waiting for or parsing the page fails.

        :param url: URL of the job detail page
        :return: None
        :raises IndexError: if the browser did not open a new window
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        opened = [h for h in self.driver.window_handles if h not in known_handles]
        self.driver.switch_to.window(opened[-1])
        try:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
            )
            source = self.driver.page_source
            self.parse_detail_page(source)
        finally:
            # Close the detail window and go back to the result list.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    def parse_detail_page(self, source):
        """
        Extract the fields of one job posting and store them.

        The resulting dict is appended to ``self.positions`` and inserted
        into the MongoDB collection. A field whose element is missing from
        the page is stored as an empty string.

        :param source: HTML source of a job detail page
        :return: None
        """
        html = etree.HTML(source)
        position_name = self._first(html.xpath("//span[@class='name']/text()"))
        job_request = html.xpath("//dd[@class='job_request']//span")

        def span_text(index):
            # Text of the index-th <span> in the job_request block, stripped;
            # empty string when the span or its text is absent.
            if index >= len(job_request):
                return ""
            return self._first(job_request[index].xpath("./text()")).strip()

        salary = span_text(0)
        # City, experience and education carry "/" separators and padding.
        city = self._clean_field(span_text(1))
        work_year = self._clean_field(span_text(2))
        education = self._clean_field(span_text(3))
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = self._first(html.xpath("//h2[@class='fl']/text()")).strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_year': work_year,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        # Save to the database through the shared client.
        self._collection().insert_one(position)
        print("*" * 40)
if __name__ == '__main__':
    # Script entry point: crawl the whole listing, then always shut the
    # browser session down so no Chrome/chromedriver process is left behind.
    spider = LagouSpider()
    try:
        spider.run()
    finally:
        spider.driver.quit()