import csv
import re
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LagouSpider(object):
    """Crawl "python crawler" job postings from Lagou and persist them to a CSV.

    Drives a real Chrome browser via Selenium: walks every listing page,
    opens each job's detail page in a second browser tab, scrapes the fields
    with lxml, and appends each record to ``job.csv`` as it is found.
    """

    # Column order shared by the CSV writer helpers and read_csv.
    HEADERS = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
    # Output file produced by the crawl.
    CSV_PATH = 'job.csv'

    def __init__(self):
        self.driver_path = r'D:\cd\chromedriver.exe'
        # Selenium 4 removed the `executable_path` keyword; a Service object
        # is the supported way to point at a local chromedriver binary.
        self.driver = webdriver.Chrome(service=Service(self.driver_path))
        self.url = ('https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB'
                    '?labelWords=&fromSearch=true&suginput=')
        self.positions = []  # every parsed job dict, in crawl order
        self.status = 0      # 0 until the CSV header row has been written

    def run(self):
        """Open the listing page and walk every result page until the
        'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager to render before trusting page_source.
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]")))
            source = self.driver.page_source
            self.page_list_page(source)
            try:
                # find_element_by_xpath was removed in Selenium 4;
                # find_element(By.XPATH, ...) works on 3.x and 4.x.
                next_btn = self.driver.find_element(
                    By.XPATH, "//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # last page reached
                next_btn.click()
            except Exception:
                # Best-effort: dump the page for debugging instead of
                # killing the crawl. Bare `except:` would also have
                # swallowed KeyboardInterrupt.
                print(source)
            time.sleep(3)  # be polite between page loads

    def page_list_page(self, source):
        """Extract every job-detail link from one listing page and visit each.

        :param source: raw HTML of a listing page.
        """
        html = etree.HTML(source)
        links = html.xpath("//div[@class='p_top']//a/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(3)  # throttle detail-page requests

    def request_detail_page(self, url):
        """Open ``url`` in a second tab, parse it, close the tab, and switch
        back to the listing tab so only two windows ever exist."""
        self.driver.execute_script("window.open('%s')" % url)
        # switch_to_window was removed in Selenium 4; switch_to.window
        # is the replacement (available since Selenium 3 as well).
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            # Must locate an *element* here -- EC waits cannot target a
            # text() node the way a plain xpath extraction can.
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")))
        self.parse_detail_page(self.driver.page_source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Scrape one job-detail page, record the position, and persist it.

        :param source: raw HTML of a job detail page.
        """
        html = etree.HTML(source)
        position_name = html.xpath(
            "//div[@class='job-name']//span[@class='name']/text()")[0]
        # Spans are expected in order: salary / city / experience / education.
        # NOTE(review): an IndexError here means Lagou changed its markup.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")

        def clean(span):
            # First text node with whitespace and '/' separators removed.
            return re.sub(r"[\s/]", "", span.xpath('.//text()')[0].strip())

        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = clean(job_request_spans[1])
        work_years = clean(job_request_spans[2])
        education = clean(job_request_spans[3])
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/em/text()")[0].strip()

        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
            'company_name': company_name,
        }
        self.positions.append(position)
        print("*" * 40)
        print(position)
        if self.status == 0:
            # First record: create the file and write the header.
            self.status = 1
            self.save_csv(position)
        else:
            print('进来了')
            self.save_csv1(position)

    def save_csv(self, data):
        """Create the CSV file, write the header row, then ``data``."""
        self._write_row(data, mode='w', header=True)

    def save_csv1(self, data):
        """Append ``data`` to the existing CSV file (no header)."""
        self._write_row(data, mode='a', header=False)

    def _write_row(self, data, mode, header):
        """Shared writer for save_csv/save_csv1: one DictWriter row per call."""
        # newline='' prevents csv from emitting blank lines on Windows.
        with open(self.CSV_PATH, mode, encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, self.HEADERS)
            if header:
                writer.writeheader()
            writer.writerow(data)

    def read_csv(self, path='job1.csv'):
        """Print each record of a previously saved CSV.

        NOTE(review): the default 'job1.csv' does not match the 'job.csv'
        this spider writes -- presumably a renamed copy; confirm the intent.
        """
        with open(path, 'r', encoding='utf-8') as fp:
            for row in csv.DictReader(fp):
                print(row)
                print(row['name'])
                print(row['desc'])
if __name__ == "__main__":
    # Launch the crawler; results accumulate in job.csv as pages are scraped.
    LagouSpider().run()