话不多说，直接上代码。
from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import re
import json
class LaGou(object):
    """Scrape Python job postings from lagou.com with a Chrome WebDriver.

    The crawler logs in, walks the paginated search results, opens every
    posting in a separate browser window, extracts a handful of fields with
    lxml XPath queries and appends each record to a local file as JSON text.
    """

    # XPath of the last <span> in the pager; ``run`` treats it as the
    # "next page" control and checks its class for ``pager_next_disabled``.
    _NEXT_BTN_XPATH = '//div[@class="pager_container"]/span[last()]'

    def __init__(self):
        """Start a Chrome session and initialise the crawl state."""
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        # Every record parsed so far, in the order the postings were visited.
        self.poistion_li = []

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath result, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    @staticmethod
    def _clean(text):
        """Strip *text* and drop every whitespace character and ``/``."""
        return re.sub(r'[\s/]', '', text.strip())

    def _span_text(self, job_request, index):
        """Return the cleaned text of the *index*-th <span> of *job_request*.

        An empty string is returned when the container or the span is absent.
        """
        if job_request is None:
            return ''
        texts = job_request.xpath('./span[%d]//text()' % index)
        return self._clean(self._first(texts))

    def parse_page_list(self, page_souce):
        """Visit every posting linked from one page of search results.

        Args:
            page_souce: HTML source of a search-result page.
        """
        html = etree.HTML(page_souce)
        a_list = html.xpath("//a[@class='position_link']/@href")
        for a in a_list:
            self.request_detail_page(a)
            # Pause between postings so requests are not fired back to back.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open *url* in a new window, parse it, then return to the list.

        Args:
            url: Address of a single job posting.

        Any exception raised while waiting for or parsing the page still
        propagates, but only after the extra window has been closed.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument rather than formatting it into
        # the JavaScript source, so quotes in the URL cannot break the call.
        self.driver.execute_script('window.open(arguments[0])', url)
        detail_handle = self.driver.window_handles[-1]
        if detail_handle == list_handle:
            # No new window appeared; closing now would kill the list page.
            print('could not open detail page: %s' % url)
            return
        self.driver.switch_to.window(detail_handle)
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="job-name"]'))
            )
            # Read the source only after the wait, so the markup being
            # parsed actually contains the element that was waited for.
            self.parse_detail_page(self.driver.page_source)
        finally:
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    def parse_detail_page(self, souce):
        """Extract one posting from its HTML, remember it and persist it.

        Args:
            souce: HTML source of a job-posting page.

        Fields whose element is missing from the page are stored as empty
        strings instead of raising ``IndexError``.
        """
        html = etree.HTML(souce)
        first = self._first

        job_name = first(html.xpath('//div[@class="job-name"]/@title')).strip()
        job_request = first(html.xpath('//dd[@class="job_request"]/p'), None)
        salary = self._span_text(job_request, 1)
        area = self._span_text(job_request, 2)
        expirence = self._span_text(job_request, 3)
        poistion_y = first(html.xpath('//dd[@class="job-advantage"]/p//text()')).strip()
        job_detail = self._clean(''.join(html.xpath('//div[@class="job-detail"]//text()')))
        company_addr = '-'.join(html.xpath("//div[@class='work_addr']/a//text()"))

        item = {
            "job_name": job_name,
            "salary": salary,
            "area": area,
            "exprience": expirence,
            "position_y": poistion_y,
            "job_detail": job_detail,
            "company_addr": company_addr,
        }
        self.poistion_li.append(item)
        # Persist only the new record: the output file is opened in append
        # mode, so writing the whole list again would duplicate every
        # earlier posting on each call.
        self.save_content(item)

    def save_content(self, item):
        """Append *item*, serialised as indented JSON, to the output file.

        Args:
            item: Any JSON-serialisable object.
        """
        json_content = json.dumps(item, ensure_ascii=False, indent=3)
        # NOTE(review): the file carries a .csv suffix but receives JSON text.
        with open('拉钩职位信息2.csv', 'a', encoding='utf-8') as f:
            f.write(json_content + '\n')

    def get_index(self):
        """Load the search URL and fill in the login form.

        NOTE(review): the two ``send_keys`` values are placeholders; they
        presumably stand for the account name and the password - confirm.
        """
        driver = self.driver
        driver.get(self.url)
        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which newer Selenium releases no longer provide.
        driver.find_element(By.CLASS_NAME, 'login').click()
        driver.find_element(By.CLASS_NAME, 'input').send_keys('xxxxxxx')
        driver.find_element(By.XPATH, '//div[@class="input_item clearfix"]/input').send_keys('xxxx')
        driver.find_element(By.XPATH, '//div[contains(@class,"sense_login_password")]').click()
        # Fixed pause after submitting; presumably lets the login complete.
        time.sleep(5)

    def run(self):
        """Log in, then scrape result pages until there is no next page.

        The crawl ends when the pager's last <span> carries the
        ``pager_next_disabled`` class, or when that control cannot be found
        or clicked (the error is reported and the loop stops).
        """
        self.get_index()
        while True:
            # parse_page_list visits every posting itself and returns
            # nothing, so there is no extra detail URL to request here.
            self.parse_page_list(self.driver.page_source)
            try:
                next_btn = WebDriverWait(self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, self._NEXT_BTN_XPATH))
                )
                if "pager_next_disabled" in (next_btn.get_attribute('class') or ''):
                    break
                next_btn.click()
            except Exception as e:
                # Looping again would re-scrape the page that was just
                # processed, so report the failure and stop instead.
                print('stopping: could not advance to the next page: %r' % (e,))
                break
            time.sleep(1)
if __name__ == '__main__':
    # Script entry point: build the crawler and run it to completion.
    spider = LaGou()
    try:
        spider.run()
    finally:
        # Always end the WebDriver session, even when the crawl fails, so
        # the browser started in LaGou.__init__ is not left running.
        spider.driver.quit()