# 可运行的完整项目，如有需要可私信联系
# (Runnable complete project; contact via private message if needed.)
# --- 爬虫部分 (crawler section) ---
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import pandas as pd
import os
import django
from selenium.webdriver.chrome.service import Service
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'boss直聘数据可视化分析.settings')
django.setup()
from myApp.models import *
class spider(object):
def __init__(self,type,page):
self.type = type
self.page = page
self.spiderUrl = "https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s"
def startBrower(self):
s = Service("chromedriver.exe")
browser = webdriver.Chrome(service=s)
# browser=webdriver.Chrome(executable_path='./chromedriver.exe')
return browser
def main(self,**info):
if info['page'] < self.page:return
brower = self.startBrower()
print('页表页面URL:' + self.spiderUrl % (self.type,self.page))
brower.get(self.spiderUrl % (self.type,self.page))
time.sleep(15)
# return
# //*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[1]/ul
job_list = brower.find_elements(by=By.XPATH, value="//ul[@class='job-list-box']/li")
for index,job in enumerate(job_list):
try:
print("爬取的是第 %d 条" % (index + 1))
jobData = []
# title 工作名字
title = job.find_element(by=By.XPATH,
value=".//div[contains(@class,'job-title')]/span[@class='job-name']").text
# address 地址
addresses = job.find_element(by=By.XPATH,
value=".//div[contains(@class,'job-title')]//span[@class='job-area']").text.split(
'·')
address = addresses[0]
# dist 行政区
if len(addresses) != 1:dist = addresses[1]
else: dist = ''
# type 工作类型
type = self.type
# // *[ @ id = "wrap"] / div[2] / div[2] / div / div[1] / div[1] / ul / li[5] / div[1] / div / div[2] / ul
tag_list = job.find_elements(by=By.XPATH,
value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
if len(tag_list) == 2:
educational = job.find_element(by=By.XPATH,
value=".//div[contains(@class,'job-info')]/ul[@class='tag-list']/li[2]&