前程无忧 (51job): Web Scraping + Data Analysis


1. Project Overview

Main Objectives

  1. Understand the IT industry's job market and map out a personal career path
  2. Keep improving one's own skill tree to match mainstream technologies and market demand

Environment:

  • Win10, Python, PyCharm

Technologies

  1. Web data collection: scrapy, urllib, re
  2. Data storage: MySQL
  3. Text segmentation and word-frequency statistics: pandas, jieba, collections
  4. Data upsampling: pandas
  5. Data discretization and pivot tables: pandas
  6. Data visualization and analysis: Excel, PPT

2. Crawler (Scrapy framework)

  • spiders/
    • quanguo.py
  • items.py
  • middlewares.py (unchanged)
  • pipelines.py
  • settings.py
  • run.py

spiders/

quanguo.py

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from ..items import QianchengwuyouItem
import re

class QuanguoSpider(scrapy.Spider):
    name = 'quanguo'
    # Start URL; the search keyword is URL-encoded twice
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,1.html'.format(parse.quote(parse.quote(input('输入搜索词:').strip())))]
    # Search result pages
    def parse(self, response):
        ls_url = response.xpath('//p/span/a').re('href="(.*?)"')
        for url in ls_url:
            yield scrapy.Request(url=url, callback=self.parse_detail)
        # "Next page" link
        next_page = response.xpath('//*[@id="resultList"]/div[55]/div/div/div/ul/li[8]/a').re('href="(.*?)"')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)
    # Job detail pages
    def parse_detail(self, response):
        item = QianchengwuyouItem()
        # Basic info
        item['city'] = self.name  # spider name ('quanguo', i.e. nationwide) stored as the scope field
        item['name'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title').extract_first()
        item['url'] = response.url
        item['salary'] = ''.join(response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract())
        region_exp_edu = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/@title').extract_first().split('\xa0\xa0|\xa0\xa0')
        item['region'] = region_exp_edu[0]
        item['cp_name'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title').extract_first()
        item['workplace'] = ''.join(response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p').re('</span>(.+?)</p>')).strip()
        item['welfare'] = '|'.join(response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/div/div/span/text()').extract())
        # Job description text
        detail = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div').extract_first().lower()
        detail = re.sub('<.*?>', '', detail)
        item['detail'] = re.sub("""[^,。?!;:‘’“”、【】…0-9.\-a-zA-Z\u4e00-\u9fa5]+""", '', detail)
        # Experience, education, headcount, publish date
        item['exp'] = item['edu'] = item['demand'] = item['pubdate'] = item['skill'] = ''
        EDU = ['博士', '硕士', '本科', '大专',
               '中专', '中技', '高中', '初中及以下']
        for i in region_exp_edu:
            if '经验' in i:
                item['exp'] = i
            elif i in EDU:
                item['edu'] = i
            elif '招' in i:
                item['demand'] = i
            elif '发布' in i:
                item['pubdate'] = i
            else:
                item['skill'] = i
        # Company info
        CP_TYPE = ['民营公司', '上市公司', '事业单位', '国企', '外资(欧美)', '外资(非欧美)',
                   '创业公司', '政府机关', '合资', '外资', '合资', '外企代表处', '非营利组织']
        CP_SCALE = ['少于50人', '50-150人', '150-500人', '500-1000人',
                    '1000-5000人', '5000-10000人', '10000人以上']
        cp_info = response.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p/text()').extract()
        item['cp_type'] = item['cp_scale'] = item['industry'] = ''
        for i in CP_TYPE:
            if i in cp_info:
                item['cp_type'] = i
                break
        for i in CP_SCALE:
            if i in cp_info:
                item['cp_scale'] = i
                break
        for i in cp_info:
            if i not in CP_TYPE and i not in CP_SCALE:
                item['industry'] = i
        return item
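
Note on the start URL: the original comment points out that the search keyword is URL-encoded twice ("两层URL编码"). A minimal sketch of what applying parse.quote twice produces, using '数据分析' as an example keyword:

from urllib import parse

keyword = '数据分析'            # example search term
once = parse.quote(keyword)     # '%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90'
twice = parse.quote(once)       # '%25E6%2595%25B0...'  ('%' itself becomes '%25')
print(twice)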

items.py

import scrapy

class QianchengwuyouItem(scrapy.Item):
    # Field names; each is declared dynamically as a scrapy.Field() below
    ls = ['url', 'name', 'salary', 'region', 'workplace',
          'cp_name', 'cp_type', 'cp_scale', 'industry',
          'exp', 'edu', 'demand', 'pubdate', 'skill',
          'welfare',
          'detail']
    for fd in ls:
        exec(fd + ' = scrapy.Field()')
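
The exec loop simply assigns a scrapy.Field() to every name in ls; written out by hand it is equivalent to the sketch below (only the first few fields shown).

import scrapy

class QianchengwuyouItem(scrapy.Item):
    # Explicit equivalent of the dynamic declarations above (first few fields only)
    url = scrapy.Field()
    name = scrapy.Field()
    salary = scrapy.Field()
    region = scrapy.Field()
    # ...the remaining fields are declared the same way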

pipelines.py

import pymysql

class QianchengwuyouPipeline(object):
    def open_spider(self, spider):
        # Connect to MySQL (database z_51job)
        self.db = pymysql.connect(host='localhost', user='root', password='yellow',
                                  charset='utf8', db='z_51job')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        # Parameterized insert, so quotes inside the scraped text cannot break the SQL
        ls = list(item)
        sentence = 'INSERT INTO quanguo (%s) VALUES (%s);' % (
            ','.join(ls), ','.join(['%s'] * len(ls)))
        self.cursor.execute(sentence, [item[f] for f in ls])
        self.db.commit()
        return item
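
The pipeline assumes a quanguo table already exists in the z_51job database. A minimal sketch of creating it, with every item field stored as TEXT (the column types are an assumption, not taken from the original project):

import pymysql

# Assumption: all columns as TEXT; adjust types and lengths as needed.
FIELDS = ['url', 'name', 'salary', 'region', 'workplace',
          'cp_name', 'cp_type', 'cp_scale', 'industry',
          'exp', 'edu', 'demand', 'pubdate', 'skill',
          'welfare', 'detail']

db = pymysql.connect(host='localhost', user='root', password='yellow',
                     charset='utf8', db='z_51job')
cursor = db.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS quanguo (%s)'
               % ', '.join('`%s` TEXT' % f for f in FIELDS))
db.commit()
db.close()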

settings.py

BOT_NAME = 'qianchengwuyou'

SPIDER_MODULES = ['qianchengwuyou.spiders']
NEWSPIDER_MODULE = 'qianchengwuyou.spiders'

# Spoof the browser User-Agent
USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

# Ignore robots.txt
ROBOTSTXT_OBEY = False

# Item pipeline for data storage
ITEM_PIPELINES = {
   'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}

run.py

from scrapy import cmdline
# Equivalent to running "scrapy crawl quanguo" from the command line
cmdline.execute(['scrapy', 'crawl', 'quanguo'])

3. Data Analysis (pandas)

  • core/
    • conf.py
    • db.py
    • df.py
  • file/
    • analysis/
    • filter/
      • EN_filter.txt
  • run.py

core/

conf.py

import os, time
home = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# English stop-word list used when counting keywords
FILTER_EN = home + r'\file\filter\EN_filter.txt'

# Path prefix (output folder + date) for the analysis result files
ymd = time.strftime('%Y%m%d', time.localtime())
PREFIX = home + '\\file\\analysis\\' + ymd

if __name__ == '__main__':
    print(home)
    print(FILTER_EN)
    print(PREFIX)

db.py

import pymysql, pandas as pd
class Mysql:
    # Connect to MySQL and select the z_51job database
    def __init__(self):
        self.db = pymysql.connect(host='localhost', user='root', password='yellow',
                                  charset='utf8', db='z_51job')
        self.cursor = self.db.cursor()
    # Execute a SQL statement (with optional parameters)
    def execute(self, sentence, arg=None):
        try:
            if arg:
                self.cursor.execute(sentence, arg)
            else:
                self.cursor.execute(sentence)
            self.db.commit()
        except Exception as error:
            print('\033[033m', error, '\033[0m')
    # Close the cursor and connection
    def close(self):
        self.cursor.close()
        self.db.close()
    # Query the key fields, excluding records with an empty salary
    def select_key(self):
        query = '''select
        name,salary,region,workplace,cp_name,cp_type,cp_scale,industry,exp,edu,demand,pubdate,welfare,detail
        from job where salary != '';'''
        df = pd.read_sql(query, self.db)
        return df

df.py

import re, jieba.posseg as jp, pandas as pd
from collections import Counter
from core.conf import (FILTER_EN, PREFIX)
# Data analysis
class Df:
    def __init__(self, data_frame, kw='python'):
        self.df = data_frame
        self.writer = pd.ExcelWriter(PREFIX + kw + '.xlsx')
    # Add a percentage column and save the result as a sheet
    def percentage(self, ls_tup, name):
        fields = [name, 'frequency']
        df = pd.DataFrame(ls_tup, columns=fields)
        df['percentage'] = df[df.columns[1]]/self.df.shape[0]
        df = df.sort_values(df.columns[1], ascending=False)  # sort by frequency, descending
        df.to_excel(self.writer, sheet_name=name, index=False)
    # Run all analyses and save the workbook
    def write(self):
        # Chinese and English word segmentation
        self.cut('detail')
        self.english('detail')
        # Salary parsing, upsampling, and discretization
        self.salary()
        self.df.to_excel(self.writer, sheet_name='origin', index=False)
        self.pivot('salary_section', 'exp')
        self.pivot('salary_section', 'edu')
        self.pivot('salary_section', 'cp_scale')
        self.pivot('salary_section', 'cp_type')
        self.pivot('salary_section', 'region')
        self.spt('region')
        self.spt('industry')
        self.spt('welfare')
        self.spt('cp_name')
        self.writer.save()
    # Chinese word frequency with POS tags (detail field)
    def cut(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # join all rows into one text
        counter = Counter()
        posseg = jp.cut(text)
        for p in posseg:
            if len(p.word) > 2 and p.flag != 'eng':
                counter[p.flag + ' | ' + p.word] += 1
        most = counter.most_common(99)
        # Save
        self.percentage(most, 'Cn%s' % field)
    # English keyword frequency (detail field)
    def english(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # join all rows into one text
        pat = '[a-z]+'
        re_ls = re.findall(pat, text)
        counter = Counter(re_ls)
        c1 = counter.most_common(199)
        # Filter out English stop words
        with open(FILTER_EN, encoding='utf-8') as f:
            prep = f.read().strip().split()
        c2 = [i for i in c1 if i[0] not in prep]
        # Save
        self.percentage(c2, 'En%s' % field)
    # Split on '|' and count (region, industry, welfare, cp_name)
    def spt(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # join all rows into one text
        ls = text.split('|')
        counter = Counter(ls)
        c = counter.most_common()
        # Save
        self.percentage(c, 'CNT%s' % field)
    # Pivot table (salary distribution)
    def pivot(self, field_1, field_2):
        name = field_1 + '-' + field_2
        pivot = self.df.pivot_table(
            values='name',
            index=field_1,
            columns=field_2,
            aggfunc='count')
        pivot.to_excel(
            self.writer,
            sheet_name=name)
    # Parse, upsample, and discretize salary
    def salary(self, n=4, start=5000, end=30000, step=1000):
        salary = self.df['salary']
        salary = ['0-0千/月' if i == '' else i for i in salary]  # placeholder for empty salary
        # Split on '/': e.g. '1-1.5万/月' -> ['1-1.5万', '月']
        spt_ls = [i.split('/') for i in salary]
        # Pay period -> monthly multiplier (20 workdays / 160 hours per month)
        period = {
            '年': 1/12,
            '月': 1,
            '天': 20,
            '小时': 160}
        for spt in spt_ls:
            spt[1] = period[spt[1]]
        # Amount unit (万/千/元) -> multiplier
        units = {
            '万': 10000,
            '千': 1000,
            '元': 1}
        for spt in spt_ls:
            for k in units.keys():
                if k in spt[0]:
                    spt[0] = spt[0].replace(k, '')
                    spt.append(units[k])
                    break
        # Extract the min and max of the salary range
        for spt in spt_ls:
            if '-' in spt[0]:
                mi, ma = spt[0].split('-')
                spt[0] = ma
                spt.insert(0, mi)
            elif '以上' in spt[0]:
                mi = spt[0].split('以上')[0]
                spt[0] = mi
                spt.insert(0, mi)
            elif '以下' in spt[0]:
                mi = spt[0].split('以下')[0]
                spt[0] = mi
                spt.insert(0, mi)
            else:
                spt.insert(0, spt[0])
        # Convert text to numbers and apply the multipliers
        dfs = pd.DataFrame(spt_ls, columns=['min', 'max', 'period', 'unit'])
        row_min = list(pd.to_numeric(dfs['min']) * dfs['period'] * dfs['unit'])
        row_max = list(pd.to_numeric(dfs['max']) * dfs['period'] * dfs['unit'])
        # Upsampling: add n-1 evenly spaced values between min and max
        row_ls = [row_min, row_max]
        for j in range(1, n):
            row = [(row_min[i] + (row_max[i] - row_min[i]) * j / n) for i in range(len(spt_ls))]
            row_ls.append(row)
        # Attach each sampled salary column to a copy of the original DataFrame
        df_ls = [pd.concat([self.df, pd.DataFrame(row, columns=['salary_value'])], axis=1) for row in row_ls]
        # Concatenate the sampled frames (reset the index) and sort ascending
        df = pd.concat(df_ls, ignore_index=True)
        df.sort_values(by=['name', 'salary_value'], axis=0, ascending=True, inplace=True)
        # Discretize salary into sections
        bins = [-1, 0] + list(range(start, end, step)) + [999999]  # custom bin edges
        open_left = ['(%d,%d]' % (bins[i], bins[i + 1]) for i in range(len(bins) - 1)]  # one label per interval
        sec = pd.cut(df['salary_value'], bins, labels=open_left).to_frame(name='salary_section')
        # Merge the section column back in
        self.df = pd.concat([df, sec], axis=1)
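
To see what salary() does to a single record: a raw string such as '1-1.5万/月' is split into ['1-1.5万', '月'], the period and unit become multipliers, min/max become 10000 and 15000 yuan per month, and with n=4 the upsampled values are 10000, 11250, 12500, 13750, 15000. A minimal standalone sketch of the same arithmetic (the helper name parse_salary is mine, not part of the project, and it skips the '以上'/'以下' cases):

# Hypothetical helper reproducing the core of Df.salary() for a single value.
PERIOD = {'年': 1/12, '月': 1, '天': 20, '小时': 160}   # pay period -> monthly multiplier
UNIT = {'万': 10000, '千': 1000, '元': 1}               # amount unit -> multiplier

def parse_salary(text, n=4):
    amount, period = text.split('/')
    multiplier = PERIOD[period]
    for k, v in UNIT.items():
        if k in amount:
            amount = amount.replace(k, '')
            multiplier *= v
            break
    lo, _, hi = amount.partition('-')
    lo = float(lo) * multiplier
    hi = float(hi) * multiplier if hi else lo
    # n + 1 evenly spaced samples between min and max (the upsampling step)
    return [lo + (hi - lo) * j / n for j in range(n + 1)]

print(parse_salary('1-1.5万/月'))   # [10000.0, 11250.0, 12500.0, 13750.0, 15000.0]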

run.py

import os, sys
# Add the project root to sys.path
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, BASE_PATH)
from core.db import Mysql
from core.df import Df
# Entry point
if __name__ == '__main__':
    db = Mysql()
    df = Df(db.select_key())
    df.write()

4. Analysis Results (Partial)

Notes
Scraping and analysis date: August 2018
Search criteria: Python, nationwide
Other: the data was lightly cleaned

Job Location Analysis

Nationwide distribution

Top 8 cities

Related Keyword Analysis

[figures omitted]

Salary Pivot Tables (Heatmaps)

Salary vs. company size

Counts (99× upsampling)
[figure omitted]
Proportions
[figure omitted]

Salary vs. work experience

Counts (99× upsampling)
[figure omitted]
Proportions
[figure omitted]
