一、爬虫部分(爬取了前程无忧和中华英才网,此处就拿前程无忧为例)
创建爬虫
scrapy startproject wyjob
scrapy genspider job 51job.com
爬取字段
数据储存
mysql
pipelines.py
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class WyjobPipeline:
    """Scrapy item pipeline that persists scraped jobs into MySQL."""

    def open_spider(self, spider):
        # Open one connection (and one cursor) for the whole crawl.
        db_params = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': '1',
            'database': 'yzhdb'
        }
        self.conn = pymysql.connect(**db_params)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the original str.format() version broke on
        # values containing quotes and was open to SQL injection.
        self.cursor.execute(
            'insert into user(gs_name,zhiwei) VALUES (%s,%s)',
            (item['gs_name'], item['zhiwei']))
        self.conn.commit()
        # Return the item so any later pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Release the cursor and the connection when the crawl ends.
        self.cursor.close()
        self.conn.close()
mongodb
pipelines.py
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class WyjobPipeline:
    """Scrapy item pipeline that stores scraped jobs in MongoDB."""

    def open_spider(self, spider):
        # Keep a handle on the client itself so it can be closed cleanly.
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.yzh
        self.collection = self.db.job

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Collections have no close(); the connection belongs to the client.
        # (The original called self.collection.close(), which raises.)
        self.client.close()
源码
job.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import WyjobItem
class JobSpider(scrapy.Spider):
    """Crawl 51job 'web' keyword search pages and parse each posting."""
    name = 'job'
    allowed_domains = ["51job.com"]
    # Pre-build the 1320 paginated search-result URLs.
    # NOTE: the original URL contained the mojibake "°reefrom" where the
    # query parameter "&degreefrom" belongs; restored here.
    start_urls = [
        "https://search.51job.com/list/000000,000000,0000,00,9,99,web,2,"
        + str(page)
        + ".html?lang=c&postchannel=0000&workyear=99&cotype=99"
        + "&degreefrom=99&jobterm=99&companysize=99&ord_field=0"
        + "&dibiaoid=0&line=&welfare="
        for page in range(1320)
    ]

    def parse(self, response):
        """Yield a request to the detail page of every job row on the page."""
        for row in response.xpath('//div[@class="el"]'):
            link = (row.xpath('p[@class="t1 "]/span/a/@href').extract_first() or '').strip()
            # Skip rows without a detail link (the original compared the
            # str()-wrapped value against the literal string "None").
            if link:
                yield scrapy.Request(link, callback=self.d_parse)

    def d_parse(self, response):
        """Extract one job posting's fields into a WyjobItem."""
        header = '//div[@class="tHeader tHjob"]'
        msg = header + '//div[@class="cn"]/p[@class="msg ltype"]/text()[{}]'

        def text_or_empty(xpath):
            # extract_first() returns None for missing nodes; the original
            # crashed calling .replace() on that None.
            return response.xpath(xpath).extract_first() or ''

        item = WyjobItem()
        item['gs_name'] = response.xpath(header + '//p/a/text()').extract_first()
        item['zhiwei'] = response.xpath(header + '//h1/text()').extract_first()
        # Keep the original convention: a missing salary is stored as '000'.
        item['xinzi'] = text_or_empty(header + '//strong/text()') or '000'
        # \xa0 is the non-breaking space 51job uses inside these fields.
        item['where'] = text_or_empty(msg.format(1)).replace('\xa0', '')
        item['jobjinyan'] = text_or_empty(msg.format(2)).replace('\xa0', '')
        item['xueli'] = text_or_empty(msg.format(3)).replace('\xa0', '')
        job_lines = response.xpath(
            '//div[@class="tBorderTop_box"]/div[@class="bmsg job_msg inbox"]/p/text()').extract()
        # Same output as the original += loop: every line gets a trailing \n.
        item['zhiweixinxi'] = ''.join(line + '\n' for line in job_lines)
        company_lines = response.xpath(
            '//div[@class="tBorderTop_box"]/div[@class="tmsg inbox"]/text()').extract()
        item['gongsixinxi'] = ''.join(line + '\n' for line in company_lines)
        return item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class WyjobItem(scrapy.Item):
    """Container for one scraped 51job posting (field names are pinyin)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    gs_name = scrapy.Field()      # company name (from the detail-page company link)
    zhiwei = scrapy.Field()       # job title
    xinzi = scrapy.Field()        # salary text as shown on the page; '000' when missing
    where = scrapy.Field()        # work location
    jobjinyan = scrapy.Field()    # required work experience
    xueli = scrapy.Field()        # required education
    zhiweixinxi = scrapy.Field()  # job-description paragraphs, newline-joined
    gongsixinxi = scrapy.Field()  # company-description text, newline-joined
二、flume监听
实现监听(截图)
将数据通过xftp传入虚拟机,再cp到flume监听的文件里面
hdfs上的效果
三、hive数据分析
从hdfs传入hive
load data inpath '/source/logs/20200714/22/upload-.1594737964393' into table job_data ;
分析数据
-- Load the analysis table: parse "low-high+unit" salary strings (e.g.
-- "1.5-2万/月") and normalize both bounds to 万/月 (10k CNY per month).
-- split(salary,'-')[0] is the bare lower bound; split(salary,'-')[1] is
-- the upper bound followed by its unit, which applies to both numbers.
-- regexp_extract() returns '' when the unit does not match, so a direct
-- "!= ''" test replaces the original "when if(... , false, true)" wrapper.
insert into fenxi_data
select
    name,
    -- upper bound of the range
    case
        when regexp_extract(split(salary,'-')[1],'(.*?)万/月',1) != ''
            then round(cast(regexp_extract(split(salary,'-')[1],'(.*?)万/月',1) as double), 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)千/月',1) != ''
            -- 10 千 = 1 万 (the original divided by 12 here: unit bug)
            then round(cast(regexp_extract(split(salary,'-')[1],'(.*?)千/月',1) as double) / 10, 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)万/年',1) != ''
            -- 12 months per year (the original divided by 10 here: unit bug)
            then round(cast(regexp_extract(split(salary,'-')[1],'(.*?)万/年',1) as double) / 12, 2)
        else 0
    end as max_salary,
    -- lower bound: the bare number shares the unit found in split[1]
    case
        when regexp_extract(split(salary,'-')[1],'(.*?)万/月',1) != ''
            then round(cast(split(salary,'-')[0] as double), 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)千/月',1) != ''
            then round(cast(split(salary,'-')[0] as double) / 10, 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)万/年',1) != ''
            then round(cast(split(salary,'-')[0] as double) / 12, 2)
        else 0
    end as min_salary,
    -- midpoint of the two normalized bounds
    case
        when regexp_extract(split(salary,'-')[1],'(.*?)万/月',1) != ''
            then round((cast(regexp_extract(split(salary,'-')[1],'(.*?)万/月',1) as double)
                        + cast(split(salary,'-')[0] as double)) / 2, 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)千/月',1) != ''
            then round((cast(regexp_extract(split(salary,'-')[1],'(.*?)千/月',1) as double)
                        + cast(split(salary,'-')[0] as double)) / 10 / 2, 2)
        when regexp_extract(split(salary,'-')[1],'(.*?)万/年',1) != ''
            then round((cast(regexp_extract(split(salary,'-')[1],'(.*?)万/年',1) as double)
                        + cast(split(salary,'-')[0] as double)) / 12 / 2, 2)
        else 0
    end as avg_salary,
    address,
    experience,
    put_date
from job_data;
工资分析
-- Salary statistics per job keyword; rows whose salary could not be
-- parsed carry min_salary = 0 and are excluded (numeric compare — the
-- original compared a double against the string '0.0').  The original
-- also grouped by "name like '%…%'", a constant true under the WHERE
-- filter, so the GROUP BY is dropped.
create table gzfx(name varchar(20), max_salary double, min_salary double, avg_salary double) charset utf8 collate utf8_general_ci;
insert into gzfx
select "数据分析", max(max_salary), min(min_salary), round(avg(avg_salary), 2)
from fenxi_data where min_salary <> 0 and name like '%数据分析%';
insert into gzfx
select "大数据开发工程师", max(max_salary), min(min_salary), round(avg(avg_salary), 2)
from fenxi_data where min_salary <> 0 and name like '%大数据开发工程师%';
insert into gzfx
select "数据采集", max(max_salary), min(min_salary), round(avg(avg_salary), 2)
from fenxi_data where min_salary <> 0 and name like '%数据采集%';
岗位分析
-- Number of big-data-related postings per city.  The original wrapped a
-- per-address GROUP BY in a subquery and summed it; a plain COUNT(*)
-- over the filtered rows yields the same total (and 0 instead of NULL
-- when a city has no matches).
create table gwfx(address varchar(55), num int) charset utf8 collate utf8_general_ci;
insert into gwfx
select "成都", count(*) from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%') and address like '%成都%';
insert into gwfx
select "北京", count(*) from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%') and address like '%北京%';
insert into gwfx
select "上海", count(*) from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%') and address like '%上海%';
insert into gwfx
select "广州", count(*) from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%') and address like '%广州%';
insert into gwfx
select "深圳", count(*) from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%') and address like '%深圳%';
工作经验
-- Salary range for big-data jobs asking for 1-3 years of experience.
create table gzjy(name varchar(55), max_salary double, min_salary double, avg_salary double) charset utf8 collate utf8_general_ci;
insert into gzjy
select "大数据", max(max_salary), min(min_salary), round(avg(avg_salary), 2)
from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%')
  and (experience like '%1年%' or experience like '%2年%' or experience like '%3年%')
  -- numeric compare; the original compared the double to the string '0.0'
  and min_salary <> 0;
岗位趋势
-- Posting counts per publish date (job-trend chart).
-- Fixes: VARCHAR needs an explicit length, and the stray double ';;'.
create table gwqs(put_date varchar(55), num int) charset utf8 collate utf8_general_ci;
insert into gwqs
select put_date, count(put_date) as num
from fenxi_data
where (name like '%数据分析%' or name like '%大数据开发工程师%' or name like '%数据采集%' or name like '%大数据%')
  and put_date != '00-00'
group by put_date;
四、sqoop导入mysql
sqoop export --connect jdbc:mysql://192.168.183.129:3306/yzh --username root --password P@ssw0rd --table gzfx --fields-terminated-by ',' --export-dir /user/hive/warehouse/jobs.db/gzfx
五、数据可视化
第一题:gzfx表
第二题:gwfx表:
第三题 gzjy表:
第四题gwqs表
六、实训小结
配置环境要细心,仔细看清楚每一个符号。
在hive数据分析时,先理清思路再动手写。
通过sqoop导入MySQL时,要先建好对应的表,字段必须与Hive表一致,别忘了设置utf-8编码。