"""获取北京 python 工作岗位,数据同时写入 MongoDB 和 csv 文件。

已知问题:偶尔执行报错 StaleElementReferenceException——页面被 JavaScript
刷新后,之前取得的元素引用失效,因此该异常并非每次都会出现,也难以复现。
(小白求助:欢迎回复解决办法和优化方案。)
"""
import csv
import re
from random import uniform
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from h_selenium.lagou_save_mongodb import *  # custom mongodb writer (SaveData)
class LagouSpider:
    """Scrape python job postings for Beijing from lagou.com.

    Every job detail page is parsed, printed, appended to ``lagou.csv``
    and inserted into MongoDB via ``SaveData`` (imported from
    ``h_selenium.lagou_save_mongodb``).
    """

    # XPath of the last pager element; its class tells us when we are on
    # the final listing page.
    _PAGER_XPATH = '//div[@class="pager_container"]/span[last()]'

    def __init__(self):
        # A real browser is required: lagou renders the listing with JS.
        self.driver = webdriver.Chrome()
        self.driver.get('https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC#filterBox')
        self.save_mongodb = SaveData()  # open the MongoDB connection once

    def run(self):
        """Walk every listing page and parse each job's detail page.

        Iterative rather than recursive so a long pagination cannot hit
        the interpreter recursion limit (the original recursed per page).
        """
        while True:
            try:
                # Wait for the pager to render -- the page is built by JS.
                WebDriverWait(self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, self._PAGER_XPATH))
                )
            except TimeoutException:
                # NOTE: selenium raises TimeoutException, not the builtin
                # TimeoutError the original code caught (which never matched).
                print('列表页错误,重新获取中')
                continue  # re-wait on the same page, then retry

            html = etree.HTML(self.driver.page_source)
            job_urls = html.xpath('//a[@class="position_link"]/@href')  # detail-page urls
            for url in job_urls:
                self.parse_job_html(url)

            # Re-locate the "next" button only now, AFTER the detail pages
            # were parsed: the JS page may have refreshed in the meantime,
            # which made the earlier-fetched reference raise
            # StaleElementReferenceException (the bug noted in the header).
            next_btn = self.driver.find_element(By.XPATH, self._PAGER_XPATH)
            if 'pager_next_disabled' in next_btn.get_attribute('class'):
                print('爬取结束')
                self.driver.quit()
                return
            next_btn.click()  # go to the next listing page

    def parse_job_html(self, url):
        """Open *url* in a new window, scrape it, close the window.

        On timeout the failed window is closed *before* retrying, so each
        recursion level owns exactly one window (the original closed the
        window in ``finally`` after recursing, closing the wrong handles).
        """
        print(url)
        self.driver.execute_script('window.open("{}")'.format(url))
        # The newest window is the LAST handle; index 1 (as the original
        # used) points at a stale window once a retry opened a third one.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="job-name"]/span[@class="name"]'))
            )
        except TimeoutException:
            print('详情页获取失败,重新打开')
            self._close_current_window()
            self.parse_job_html(url)  # retry with a fresh window
            return
        try:
            self.data_fetch(self.driver.page_source, url)  # extract the data
        finally:
            self._close_current_window()

    def _close_current_window(self):
        """Sleep a random 1-6 s (anti-bot), close the current window and
        switch focus back to the listing window."""
        sleep(uniform(1, 6))
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def data_fetch(self, text, url):
        """Extract one job posting from detail-page HTML *text*, print it,
        then persist it to csv and MongoDB."""
        html = etree.HTML(text)
        job_name = html.xpath('//div[@class="job-name"]'
                              '/span[@class="name"]/text()')[0]  # job title

        def format_tool(_str):
            # Strip whitespace and the " / " separators lagou puts
            # between the request fields.
            return re.sub(r'[\s /]', '', _str).strip()

        describe = html.cssselect('dd.job_request p span')
        describe = [format_tool(i.text) for i in describe]
        # Last span is the pager/extra field; the first four are fixed.
        salary, city, job_years, edu = describe[:-1]
        company_name = html.xpath('//h2[@class="fl"]//text()')[0].strip()
        advantage = html.cssselect('dd.job-advantage p')[0].text  # perks blurb
        description = format_tool(
            ''.join(html.xpath('//dd[@class="job_bt"]/div//text()')))  # requirements
        # Drop the trailing "查看地图" link fragments ([:-2]).
        address = format_tool(
            ''.join(html.xpath('//div[@class="work_addr"]//text()')[:-2]))

        print(job_name)
        print(salary, city, job_years, edu, company_name)
        print('-' * 30)
        print(advantage)
        print('-' * 30)
        print(description)
        print('-' * 30)
        print(address)
        print('=' * 60 + '\n')

        data = {'job_name': job_name, 'company_name': company_name,
                'salary': salary, 'city': city,
                'job_years': job_years, 'edu': edu, 'advantage': advantage,
                'description': description, 'address': address, 'url': url}
        self.save_data_csv(data)                 # append to csv
        self.save_mongodb.insert_data(data)      # insert into MongoDB

    def save_title_csv(self):
        """Write the csv header row (``with`` guarantees the handle is
        closed even on error -- the original leaked it)."""
        data_title = ['岗位名称', '公司名称', '薪资', '城市', '工作年限',
                      '学历要求', '职位诱惑', '岗位要求', '公司地址', '网页地址']
        with open('lagou.csv', 'a', encoding='utf-8-sig', newline='') as file:
            writer = csv.DictWriter(file, data_title)
            writer.writeheader()

    def save_data_csv(self, data):
        """Append one job row to ``lagou.csv`` (dict insertion order
        matches the header written by save_title_csv)."""
        with open('lagou.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(list(data.values()))
        print('=' * 20 + 'csv写入成功' + '=' * 20)

    def start(self):
        """Entry point: write the csv header, then crawl."""
        self.save_title_csv()
        self.run()
if __name__ == '__main__':
    # Launch the crawler: writes the csv header, then scrapes every page.
    spider = LagouSpider()
    spider.start()
# ---------------------------------------------------------------------------
# lagou_save_mongodb.py —— 上面导入的 MongoDB 写入模块(内容如下)
# ---------------------------------------------------------------------------
import pymongo
class SaveData(object):
    """Thin wrapper that writes scraped job dicts into MongoDB
    (database ``spider_demo``, collection ``spider_lagou``)."""

    def __init__(self):
        self.client = pymongo.MongoClient(port=27017, host='localhost')
        self.db = self.client.spider_demo        # database, created lazily on first write
        self.collection = self.db.spider_lagou   # collection, created lazily on first write

    def insert_data(self, data):
        """Insert one document.

        ``Collection.insert`` was deprecated in pymongo 3 and removed in
        pymongo 4 -- ``insert_one`` is the supported API.
        """
        self.collection.insert_one(data)
        print('=' * 20 + 'mongodb写入成功' + '=' * 20)
if __name__ == '__main__':
    # Smoke test: insert one sample document.
    saver = SaveData()
    saver.insert_data({'age': 8, 'name': '张三'})