Python 3 爬取 51job 并可视化
效果如下
使用多进程爬取 51job 职位数据并插入到 MySQL 中
#____author:"xie"
#date:2020-10-19
# -*- coding: utf-8 -*-
import requests
import time,re
import pymysql
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from multiprocessing import Pool
from urllib import parse
# Today's date (YYYY-MM-DD), available for tagging scraped records.
time_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
# Shared HTTP session with up to 3 automatic retries per scheme.
req = requests.Session()
req.mount('http://', HTTPAdapter(max_retries=3))
req.mount('https://', HTTPAdapter(max_retries=3))
# Browser-like request headers; the Cookie below pins the 51job search state
# (region 040000, keyword "python") captured from a real browser session.
# NOTE(review): the hard-coded Cookie will expire — refresh it if requests
# start returning empty result pages.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'Host': 'search.51job.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ru;q=0.7',
'Upgrade-Insecure-Requests': '1',
'Cookie':'partner=www_baidu_com; guid=dee465ee11cc6ecc88e03ab6df40d83f; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
}
def file(url):
    """Scrape one 51job search-result page and store every posting via file_sql.

    Args:
        url: A search-result URL of the form
            https://search.51job.com/list/040000,...,99,<keyword>,2,<page>.html
            where <keyword> is double-URL-encoded.

    Side effects: issues HTTP requests and inserts rows into MySQL.
    Per-posting failures are printed and skipped (best effort).
    """
    # Extract and decode the double-encoded search keyword from the URL.
    job_name = re.findall(
        r'https://search.51job.com/list/040000,000000,0000,00,9,99,(.*?),',
        str(url), re.I)[0]
    job_name = parse.unquote(parse.unquote(job_name))
    # `total` is never updated, so each call processes exactly one page;
    # pagination is driven by the caller handing us one URL per page.
    current, total = 1, 1
    while current <= total:
        response = req.get(url=url, headers=headers, timeout=20)
        print('当前第{}页,url是:{}'.format(current, url))
        soup = BeautifulSoup(response.text, 'html.parser')
        # Job links live inside an inline <script type="text/javascript">
        # JSON blob, so pull them out with a regex over the script text.
        script_blob = soup.find_all(type="text/javascript")
        job_links = re.findall(r'"job_href":"(.*?)"', str(script_blob), re.I)
        for link in job_links:
            # Un-escape the JSON-escaped slashes; bind to a NEW name so the
            # `url` parameter is not clobbered (the original overwrote it,
            # which also made the page log above report a stale URL).
            job_url = str(link).replace(r'\/', '/')
            try:
                detail = req.get(url=job_url, headers=headers, timeout=20)
                time.sleep(0.5)  # throttle: be polite to the server
                page = detail.content.decode("gbk")  # detail pages are GBK-encoded
                detail_soup = BeautifulSoup(page, 'html.parser')
                position = re.findall(r'<h1 title="(.*?)">', page, re.I)[0]
                salary = re.findall(r'</h1><strong>(.*?)</strong>', page, re.I)[0]
                company = detail_soup.find_all('p', class_='cname')
                company_name = re.findall(r'title="(.*?)"', str(company), re.I)[0]
                # Location / experience / education line; strip the "|" separators.
                msg_type = detail_soup.find_all('p', class_='msg ltype')[0]
                msg_type = re.findall(r'title="(.*?)</p>', str(msg_type), re.I)[0]
                msg_type = (str(msg_type).replace('<span>|</span>', '')
                            .replace('|', '').split('">')[-1].split())
                # Benefit tags and the free-text job description.
                jtags = [tag.string
                         for tag in detail_soup.find_all('span', class_='sp4')]
                job_msgs = [node.string
                            for node in detail_soup.find_all(
                                'div', class_='bmsg job_msg inbox')[0]
                            if node.string is not None]
                file_sql(company_name, position, salary, msg_type, jtags,
                         job_msgs, job_url, job_name)
            except Exception as e:
                # Best effort: postings whose markup doesn't match the regexes
                # (IndexError) or that fail to download are skipped, not fatal.
                print(e)
        current += 1
def file_sql(company_name, position, salary, msg_type, jtags, job_msgs, url, job_name):
    """Insert one scraped job posting into the MySQL table `51_data`.

    List-like fields (msg_type / jtags / job_msgs) are flattened to plain
    strings first. A fresh connection is opened per call: each Pool worker
    is a separate process, so connections cannot be shared.
    On failure the transaction is rolled back and the error printed.
    """
    # Keyword arguments: positional connect() args were removed in pymysql 1.0.
    db = pymysql.connect(host="10.147.17.10", user="root",
                         password="6340563", database="51job")
    # Flatten the list reprs into plain strings (same transforms as before).
    job_msgs = str(job_msgs).replace('[', '').replace(']', '').replace(r"', '", '')
    msg_type = str(msg_type).replace(r"'", '').replace('[', '').replace(']', '')
    jtags = str(jtags).replace(r"'", '').replace('[', '').replace(']', '')
    print('正在插入:{} 信息'.format(company_name))
    # Parameterized query. The original interpolated values with str.format,
    # which is SQL-injectable, broke on any value containing a quote, and
    # left the job_msgs placeholder unquoted entirely. Identifiers starting
    # with a digit are backtick-quoted for safety.
    sql = ("INSERT INTO `51_data`(company_name, position, salary, msg_type, "
           "jtags, job_msgs, `51_url`, job_name) "
           "VALUES(%s, %s, %s, %s, %s, %s, %s, %s);")
    try:
        with db.cursor() as cursor:  # cursor closed automatically
            cursor.execute(sql, (company_name, position, salary, msg_type,
                                 jtags, job_msgs, url, job_name))
        db.commit()
    except Exception as e:
        print(e)  # surface the failure instead of silently swallowing it
        db.rollback()
    finally:
        db.close()
if __name__ == '__main__':
    HomePage = 'https://search.51job.com/'
    # The keyword must be double-URL-encoded, mirroring 51job's own links.
    search = parse.quote(parse.quote(input('请输入搜索词:')))
    # Fetch page 1 once just to read the total page count from the inline JSON.
    url = HomePage + 'list/040000,000000,0000,00,9,99,{},2,1.html'.format(search)
    response = req.get(url=url, headers=headers, timeout=20)
    total = int(re.findall(r'"total_page":"(\d+)"', response.text, re.I)[0])
    # Build one URL per result page. The original used range(1, total), which
    # skipped the final page — the upper bound must be total + 1 (inclusive).
    tot_page = [
        HomePage + 'list/040000,000000,0000,00,9,99,{},2,{}.html'.format(search, page)
        for page in range(1, total + 1)
    ]
    pool = Pool(processes=10)
    pool.map(file, tot_page)  # crawl pages in parallel worker processes
    pool.close()
    pool.join()
    print('爬取完成!')
通过flask控制前端展示数据
效果
项目下载链接:
https://download.csdn.net/download/ranranran52/13104414
flask运行成功后地址是 http://127.0.0.1:5000(Flask 开发服务器默认使用 HTTP,而非 HTTPS)