Python Website Scraping and Visualization: Scraping a Job Site with Scrapy and Visualizing the Data

Recommended plugin: XPath Helper.

XPath Helper is a Chrome extension that lets you click an element on a page and get its XPath. The whole scrape relies on XPath, regular expressions, message middleware, and Scrapy's multi-threaded scheduling.

The most important part is working out the XPath expressions; they are the soul of the whole scrape.
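A convenient way to check an XPath before putting it into the spider is Scrapy's interactive shell. A minimal sketch (the URL placeholder and the expression are only illustrations; the same expressions appear in the spider below):

# In a terminal, open the target page in Scrapy's interactive shell
scrapy shell "<list page URL>"

# Inside the shell, try the expression before copying it into the spider:
response.xpath("//div[@class='dw_table']//div[@class='el']/p[1]//span/a/@href").getall()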

1. Creating the Scrapy project

scrapy startproject [project_name]

Create a spider with:

scrapy genspider yingcaiwang "<domain>"

Run the spider with:

scrapy crawl [spider_name]  (the spider's name attribute; in this project it is qiangcheng)

Create a start.py launcher (a sketch follows).
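A minimal start.py sketch, assuming the spider name qiangcheng used later in this post; running this file starts the crawl without typing the command by hand:

# start.py - launch the spider from an IDE instead of the command line
from scrapy import cmdline

# Equivalent to running "scrapy crawl qiangcheng" in the project directory
cmdline.execute("scrapy crawl qiangcheng".split())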

2. Project layout

2.1 items.py: defines the models that hold the scraped data (a sketch for this project appears after this list).

2.2 middlewares.py: holds the various middlewares.

2.3 pipelines.py: stores the item models to local storage (here, MongoDB).

2.4 settings.py: configuration for this crawler (request headers, request interval, IP proxy pool, and so on).

2.5 scrapy.cfg: the project's configuration file.

2.6 spiders package: every spider lives in here.
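Since the spider below imports QianchengwuyouItem, items.py needs to declare the fields the spider fills in. A sketch inferred from those fields (the original post does not show this file):

# items.py - one field per value the spider extracts
import scrapy


class QianchengwuyouItem(scrapy.Item):
    job_name = scrapy.Field()        # job title
    job_money = scrapy.Field()       # salary range
    job_company = scrapy.Field()     # company name
    job_place = scrapy.Field()       # work location
    job_experience = scrapy.Field()  # required experience
    job_education = scrapy.Field()   # required education
    job_nz = scrapy.Field()          # responsibilities and requirements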

3. Writing the spider

First, the project structure.

Straight to the code:

# -*- coding: utf-8 -*-
import scrapy

from qianchengwuyou.items import QianchengwuyouItem


class QiangchengSpider(scrapy.Spider):
    name = 'qiangcheng'
    allowed_domains = ['51job.com']
    start_urls = ["https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="]

    def parse(self, response):
        # Collect the detail-page link of every job posting on the list page
        job_list = response.xpath("//div[@class='dw_table']//div[@class='el']/p[1]//span/a/@href").getall()
        for job_url in job_list:
            yield scrapy.Request(url=job_url, callback=self.jiexi_content, dont_filter=True)

        # Follow the "next page" link, if there is one
        next_pages = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_pages:
            yield scrapy.Request(url=next_pages, callback=self.parse, dont_filter=True)

    def jiexi_content(self, response):
        item = QianchengwuyouItem()
        item['job_name'] = response.xpath("//div[@class='cn']/h1/text()").extract_first()
        # extract_first() returns None when nothing matches, so fall back to "面议" (negotiable)
        item['job_money'] = response.xpath("//div[@class='cn']/strong/text()").extract_first() or "面议"
        item['job_company'] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()

        # This paragraph holds the work location, required experience and education;
        # split them out by index and fall back to a default when a field is missing
        job_all = response.xpath("//div[@class='cn']//p[2]/text()").getall()
        try:
            item['job_place'] = job_all[0].strip()
        except IndexError:
            item['job_place'] = ""
        try:
            item['job_experience'] = job_all[1].strip()
        except IndexError:
            item['job_experience'] = '无要求'
        try:
            item['job_education'] = job_all[2].strip()
        except IndexError:
            item['job_education'] = '不要求'

        # Job description: every paragraph of the responsibilities and requirements
        item['job_nz'] = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").getall()

        yield item

In settings.py, configure the MongoDB connection, enable the pipeline, and set the request headers. A sketch of the relevant settings follows.
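The pipeline below reads MONGO_HOST, MONGO_PORT, MONGO_DB and MONGO_COLL from the project settings, so settings.py needs something along these lines (the exact values are assumptions based on the visualization code, which reads from the admin database and the qiangcheng collection):

# settings.py - only the parts this project relies on
ROBOTSTXT_OBEY = False

# A browser-like User-Agent so requests are not rejected outright
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

# MongoDB connection details read by the pipeline
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'admin'
MONGO_COLL = 'qiangcheng'

# Enable the pipeline that writes items to MongoDB
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}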

In pipelines.py, write the items to MongoDB:

import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class QianchengwuyouPipeline:

    def __init__(self):
        # Connect to MongoDB using the values from settings.py
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]      # database handle
        self.coll = self.db[settings['MONGO_COLL']]      # collection handle
        # If the database requires credentials:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        post_item = dict(item)           # convert the item into a plain dict
        self.coll.insert_one(post_item)  # insert one record into the collection
        return item                      # returning the item keeps it visible in the console output

    def close_spider(self, spider):
        self.client.close()

The spider crawling the data.

The data stored in MongoDB; the records can be spot-checked as sketched below.
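A quick sketch for checking that the records landed in the collection (assuming the admin database and qiangcheng collection used throughout this post):

# check_db.py - inspect what the pipeline has written
import pymongo

client = pymongo.MongoClient("127.0.0.1", port=27017)
coll = client['admin']['qiangcheng']

print(coll.count_documents({}))  # number of postings stored so far
print(coll.find_one())           # one sample document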

4. Visualizing the data

from pyecharts.charts import Pie
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']

# Count the data-collection ("数据采集") postings in each city
x_data = ["成都", "北京", "上海", "广州", "深圳"]
y_data = []
for city in x_data:
    query = {"$and": [{"job_place": {"$regex": city}},
                      {"job_name": {"$regex": "数据采集"}}]}
    y_data.append(sum(1 for _ in mytable.find(query)))

data_pair = list(zip(x_data, y_data))
pie = Pie()
pie.add(series_name="数据采集地区岗位", data_pair=data_pair)
pie.render("数据采集饼图.html")

5. The resulting pie chart

6. Maximum, minimum and average salary for big data, data collection and data analysis jobs

from pyecharts import options as opts
from pyecharts.charts import Bar
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']


def salary_stats(query):
    # Collect the number before the "-" in each posted salary range
    salaries = []
    for doc in mytable.find(query):
        low = doc["job_money"].split("-")[0]
        try:
            salaries.append(float(low))
        except ValueError:
            continue  # skip postings whose salary is not a plain number
    avg = float("{:.1f}".format(sum(salaries) / len(salaries)))
    # Return maximum, minimum and average salary
    return [max(salaries), min(salaries), avg]


def tu(y1, y2, y3):
    gongzi = ['数据分析', '大数据开发', '数据采集']
    bar = Bar()
    bar.add_xaxis(xaxis_data=gongzi)
    # First argument is the legend name, second is the y-axis data
    bar.add_yaxis(series_name="最高工资", y_axis=y1)
    bar.add_yaxis(series_name="最低工资", y_axis=y2)
    bar.add_yaxis(series_name="平均工资", y_axis=y3)
    # Chart title and toolbox
    bar.set_global_opts(title_opts=opts.TitleOpts(title='工资分析图', subtitle='工资单位:/月'),
                        toolbox_opts=opts.ToolboxOpts())
    bar.render("三个地区的数据分析.html")


if __name__ == '__main__':
    shujufenxi = {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据分析"}}]}
    dashujugongchengshi = {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "大数据工程师"}}]}
    shujucaiji = {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据采集"}}]}

    stats = [salary_stats(q) for q in (shujufenxi, dashujugongchengshi, shujucaiji)]
    y1 = [s[0] for s in stats]  # maximum salaries
    y2 = [s[1] for s in stats]  # minimum salaries
    y3 = [s[2] for s in stats]  # average salaries
    tu(y1, y2, y3)

There are also charts analyzing salaries by year; the approach is much the same.

Finally, the data is presented on a web page; I used a static page. One way to combine the rendered charts is sketched below.
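A minimal sketch of one way to build that static page, assuming the Pie and Bar objects created in the scripts above; pyecharts' Page container renders several charts into a single HTML file:

# Combine the charts built above into one static HTML page
from pyecharts.charts import Page

page = Page()
page.add(pie, bar)         # the Pie and Bar objects created earlier
page.render("index.html")  # open index.html in a browser to view both charts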
