Practicum Requirements
Write a crawler program in Python to scrape job postings from a recruitment website, store the data in a MongoDB database, clean the stored data, run data analysis on the cleaned data, and finally visualize the analysis results.
Crawler
The crawler is built with the Scrapy framework. The project (job51) consists of the spider, the settings, and the item pipeline, listed below.
job.py (the spider)
# -*- coding: utf-8 -*-
import scrapy

from job51.items import Job51Item


class QcwySpider(scrapy.Spider):
    name = 'job'
    # Bare domain names only; a full URL here would make the
    # offsite middleware drop every request.
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0130%252C7501%252C7506%252C7502,01%252C32%252C38,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # Collect the detail-page link of every posting on the result list.
        all_urls = response.xpath("//*[@id='resultList']/div[@class='el']/p/span/a/@href").getall()
        for url in all_urls:
            yield scrapy.Request(url, callback=self.parse_html, dont_filter=True)
        # Follow the pagination until the last page.
        next_page = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

    def parse_html(self, response):
        item = Job51Item()
        try:
            jobname = response.xpath("//div[@class='cn']/h1/text()").getall()[0]
            salary = response.xpath("//div[@class='cn']//strong/text()").get()
            company = response.xpath("//div[@class='cn']//p[@class='cname']/a[1]/@title").get()
            # The 'msg ltype' paragraph lists city, experience, and education in order.
            msg = response.xpath("//div[@class='cn']//p[@class='msg ltype']/text()").getall()
            city = msg[0]
            workyear = msg[1]
            record = msg[2]
            requirements = response.xpath("//div[@class='bmsg job_msg inbox']//text()").getall()
            requirement_str = ""
            for requirement in requirements:
                requirement_str += requirement.strip()
            skill = ""
            keyword = response.xpath("//p[@class='fp'][2]/a/text()").getall()
            for i in keyword:
                skill += i + " "
        except Exception:
            # Pages with a different layout produce an empty record
            # instead of crashing the spider.
            jobname = salary = company = city = workyear = record = ""
            requirement_str = skill = ""
        finally:
            item["jobname"] = jobname
            item["salary"] = salary
            item["company"] = company
            item["city"] = city
            item["workyear"] = workyear
            item["record"] = record
            item["requirement"] = requirement_str
            item["skill"] = skill
            yield item
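The spider imports Job51Item from job51.items, a file not reproduced in the listings. A definition consistent with the eight fields assigned above would look like the following sketch:

# job51/items.py
import scrapy


class Job51Item(scrapy.Item):
    jobname = scrapy.Field()      # job title
    salary = scrapy.Field()       # salary range as listed
    company = scrapy.Field()      # company name
    city = scrapy.Field()         # work location
    workyear = scrapy.Field()     # required experience
    record = scrapy.Field()       # required education
    requirement = scrapy.Field()  # full job-description text
    skill = scrapy.Field()        # keyword tags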
settings.py
BOT_NAME = 'job51'

SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'

# MongoDB connection settings consumed by Job51Pipeline.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = '51job_hive'
MONGODB_DOCNAME = 'job51hive'

DOWNLOAD_DELAY = 1
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # The header name must be 'User-Agent' (hyphen), not 'User_Agent',
    # otherwise the browser UA string is never sent.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
}

ITEM_PIPELINES = {
    'job51.pipelines.Job51Pipeline': 300,
}
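With the pipeline registered, the crawl is started with scrapy crawl job from the project root. Equivalently, a small runner script (not part of the original project, shown here only as a sketch) can launch it:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Launch the 'job' spider with the project settings,
# equivalent to running `scrapy crawl job`.
process = CrawlerProcess(get_project_settings())
process.crawl('job')
process.start()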
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.utils.project import get_project_settings
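
The listing breaks off after this import. A minimal pipeline consistent with the MONGODB_* settings above would look like the sketch below; it assumes pymongo is installed and is a reconstruction, not the original code:

import pymongo
from scrapy.utils.project import get_project_settings


class Job51Pipeline(object):
    def __init__(self):
        # Read the MongoDB connection parameters from settings.py.
        settings = get_project_settings()
        client = pymongo.MongoClient(
            host=settings['MONGODB_HOST'],
            port=settings['MONGODB_PORT'],
        )
        db = client[settings['MONGODB_DBNAME']]
        self.collection = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Insert each scraped posting as one MongoDB document.
        self.collection.insert_one(dict(item))
        return item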