Scraping Sina Weibo data with Python / Scraping and analyzing 51job job postings

```python
# -*- coding:utf-8 -*-
# @Time : 2020-11-10 20:57
# @Author : BGLB
# @Software : PyCharm
import csv
from decimal import Decimal
import hashlib
import json
import logging
import logging.config
import os
import random
import re
import time
from urllib import parse

from lxml import html
from requests import get

etree = html.etree

headers = {
    "Host": "search.51job.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400",
}


def time_logging(func):
    """Decorator that records how long a function takes to run.

    :param func: the function being timed
    :return: the wrapped function
    """
    def wrapper(*args, **kw):
        start_time = time.time()
        func_result = func(*args, **kw)
        runtime = time.time() - start_time
        if runtime < 60:
            runtime = "{:.2f}s".format(runtime)
        elif runtime < 3600:
            runtime = "{:.2f}m".format(runtime / 60)
        else:
            runtime = "{:.2f}h".format(runtime / 3600)
        content = '[{0:^15}] - 运行时间 - [{1:^6}]'.format(func.__name__, runtime)
        # print(content)
        logging.info(content)
        with open("./log/runtime.log", 'a', encoding='utf8') as f:
            f.writelines(content + '\n')
        return func_result
    return wrapper
```
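
As a quick illustration (with a hypothetical `slow_step` function that is not part of the scraper): any function wrapped with the decorator gets its elapsed time sent to the root logger and appended to `./log/runtime.log`, so that folder has to exist before the first wrapped call returns:

```python
os.makedirs("./log", exist_ok=True)   # the wrapper appends to ./log/runtime.log

@time_logging
def slow_step():
    time.sleep(1.2)                   # stand-in for real work

slow_step()   # appends something like "[   slow_step   ] - 运行时间 - [1.20s ]"
```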

```python
def search_job(job_key, page_num=1):
    """Search postings in four cities (Shanghai, Guangzhou, Shenzhen, Wuhan); each result page holds fifty postings."""
    url = "https://search.51job.com/list/020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,{},2,{}.html"
    response = get(url.format(parse.quote(
        parse.quote(job_key)), page_num), headers=headers)
    page = response.content.decode(response.encoding)
    eroot = etree.HTML(page)
    # the results are embedded in an inline <script> as a JS assignment; split on "=" to get the JSON
    table_list = eroot.xpath('//script[@type="text/javascript"]')
    # print(table_list[2].text)
    # print(table_list[2].text.split("=",1)[-1])
    json_str = json.loads(table_list[2].text.split("=", 1)[-1])
    return json_str
```
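
A quick usage sketch (assuming the search page still embeds its results the way this script expects): the returned dict is used elsewhere only through its `total_page`, `curr_page` and `engine_search_result` keys.

```python
result = search_job("python", page_num=1)      # first result page for the keyword "python"
print(result["total_page"])                    # how many result pages exist for this keyword
print(len(result["engine_search_result"]))     # up to 50 postings on this page
```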

```python
@time_logging
def parse_job_msg(search_result):
    """Parse one page of search results into a list of dicts, one per posting."""
    print(
        "-------------正在解析第{}个页面数据--------------".format(search_result["curr_page"]))
    job_msg_list = search_result["engine_search_result"]  # 50 postings per page
    csv_list = []
    for job_msg in job_msg_list:
        # job id
        jobid = job_msg["jobid"]
        # company id
        coid = job_msg["coid"]
        # job url
        job_href = job_msg["job_href"]
        if job_href.split("/")[2].split(".")[0] == "jobs":
            job_detail_str = get_job_msg(job_href)
        else:
            # postings not hosted on jobs.51job.com go through the JSON API;
            # strip the HTML tags from the returned description
            pattern = re.compile(r'<[^>]+>', re.S)
            job_detail_str = pattern.sub(
                '', get_51rz_json("job_detail", {"jobid": jobid}))
        # job title
        job_name = job_msg["job_name"]
        # company url
        co_href = job_msg["company_href"]
        # company name
        co_name = job_msg["company_name"]
        # salary text, later parsed into minimum / maximum
        money = job_msg["providesalary_text"]
        # work location
        workarea = job_msg["workarea_text"]
        # company type
        co_type = job_msg["companytype_text"]
        # publish date
        update_time = job_msg["issuedate"]
        # benefits
        jobwelf = job_msg["jobwelf"]
        if money == "" or money is None:
            logging.error("{}的工作薪资{}获取失败".format(job_href, money))
            continue
        # 'attribute_text': ['上海-闵行区', '1年经验', '大专', '招2人']
        job_attr = job_msg["attribute_text"]
        job_po_tmp = job_year_tmp = ""
        job_education = "不限"
        for x in job_attr:
            if '招' in x:
                job_po_tmp = x
            if '经验' in x:
                job_year_tmp = x
            if x in "高中大专本科博士硕士":
                job_education = x
        panter = re.compile(r'\d+')
        if len(panter.findall(job_po_tmp)) > 0:
            job_po = int(panter.findall(job_po_tmp)[0])
        else:
            job_po = 0
        if len(panter.findall(job_year_tmp)) > 0:
            job_year = int(panter.findall(job_year_tmp)[0])
        else:
            job_year = 0
        # company headcount
        co_people = job_msg["companysize_text"]
        # company business scope
        co_jx = job_msg['companyind_text']
        ss_s = money.split("-")
        if len(ss_s) < 2:
            money_min = money_max = 0
        else:
            money_min, money_max = parse_money(money)
        csv_dict = {
            "职位名称": job_name,
            "最低薪资(千/月)": money_min,
            "最高薪资(千/月)": money_max,
            "招聘人数": job_po,
            "工作经验(年)": job_year,
            "最低学历": job_education,
            "工作地点": workarea.split("-")[0],
            "工作福利": jobwelf,
            "职位描述和详细条件": job_detail_str,
            "公司名称": co_name,
            "公司类型": co_type,
            "公司人数": co_people,
            "公司经营范围": co_jx,
            "职位详情url": job_href,
            "公司详情url": co_href,
            "发布时间": update_time,
        }
        csv_list.append(csv_dict)
    return csv_list
```
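
For the sample `attribute_text` shown in the comment above, the attribute loop plus the `\d+` regex boil the list down to three structured fields; a standalone re-run of that logic:

```python
import re

job_attr = ['上海-闵行区', '1年经验', '大专', '招2人']
job_po_tmp = job_year_tmp = ""
job_education = "不限"
for x in job_attr:
    if '招' in x:
        job_po_tmp = x            # '招2人'   -> number of openings
    if '经验' in x:
        job_year_tmp = x          # '1年经验' -> years of experience required
    if x in "高中大专本科博士硕士":
        job_education = x         # '大专'    -> minimum education
panter = re.compile(r'\d+')
job_po = int(panter.findall(job_po_tmp)[0]) if panter.findall(job_po_tmp) else 0
job_year = int(panter.findall(job_year_tmp)[0]) if panter.findall(job_year_tmp) else 0
print(job_po, job_year, job_education)   # 2 1 大专
```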

```python
def parse_money(money_text):
    """Turn a salary string such as '1-1.5万/月' into [min, max] in thousand CNY per month."""
    money_min = money_max = Decimal(0)
    ss_s = money_text.split("-")
    if len(ss_s) >= 2:
        money_min = Decimal(ss_s[0])
        money_max = Decimal(ss_s[1].split("/")[0][:-1])
        if money_text.split('/')[0][-1] == "万":
            money_min = 10 * money_min
            money_max = 10 * money_max
        if money_text.split('/')[-1] == "年":
            money_max /= 12
            money_min /= 12
    return [money_min.quantize(Decimal("0.00")), money_max.quantize(Decimal("0.00"))]
```
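
A few worked conversions, assuming the salary strings follow 51job's usual `最低-最高万|千/月|年` pattern (everything is normalised to thousand CNY per month):

```python
print(parse_money("1-1.5万/月"))   # [Decimal('10.00'), Decimal('15.00')]
print(parse_money("6-8千/月"))     # [Decimal('6.00'), Decimal('8.00')]
print(parse_money("10-20万/年"))   # [Decimal('8.33'), Decimal('16.67')]  (yearly pay divided by 12)
```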

```python
def init_params(oparams):
    """Build the signed query parameters, reverse-engineered from the site's JavaScript."""
    key = "tuDmheJQBlgy&Sm300l8xK^X4NzFYBcrN8@YLCret$fv1AZbtujg*KN^$YnUkh"
    keyindex = random.randint(4, 40)
    sParams = json.dumps(oparams)
    md5 = hashlib.md5()
    md5.update(("coapi" + sParams + str(key[keyindex:keyindex + 15])).encode("utf8"))
    sign = md5.hexdigest()
    # print(md5.hexdigest())
    return {
        "key": keyindex,
        "sign": sign,
        "params": sParams
    }
```
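
The signature is the MD5 of `"coapi"` + the JSON-encoded params + a 15-character slice of the shared key, and the random slice offset is sent along as `key` so the server can rebuild the same string. A quick look at the shape of the result (the jobid is the example value from `get_51rz_json`'s docstring; `sign` changes with the random offset):

```python
signed = init_params({"jobid": 126817691})
print(signed["key"])      # e.g. 17 - random offset into the shared key string
print(signed["sign"])     # 32-character md5 hex digest
print(signed["params"])   # '{"jobid": 126817691}'
```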

```python
@time_logging
def get_51rz_json(interface: str, params: dict):
    """Wrapper around the coapi.51job.com endpoints used by 51rz.51job.com.

    job_list         - job listing
    job_detail       - job detail, e.g. {"jobid": 126817691}
    commpany_list    - company listing
    commpany_detail  - company detail, e.g. {"coid": ...}
    job_condition    - job requirements
    job_time_table   - job schedule
    """
    url_interface = {
        "job_list": "https://coapi.51job.com/job_list.php",
        "job_detail": "https://coapi.51job.com/job_detail.php",
        "commpany_list": "https://coapi.51job.com/co_list.php",
        "commpany_detail": "https://coapi.51job.com/job_company.php",
        "job_condition": "https://coapi.51job.com/job_condition.php",  # job requirements
        "job_time_table": "https://coapi.51job.com/job_schedule.php",  # job schedule
    }
    header = {
        "Host": "coapi.51job.com",
        "Referer": "https://51rz.51job.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    url = url_interface[interface]
    res = get(url, init_params(params), headers=header)
    # print(res.url)
    res_str = res.content.decode("utf8")
    filename = "{}".format(interface)
    for x in params.values():
        filename += "_" + str(x)
    # the response is JSONP-style: strip the "callback(...)" wrapper to get the JSON body
    res_json = res_str.split("(", 1)[-1][0:-1]
    res_dict = dict(json.loads(res_json))
    res_dict["html_url"] = res.url
    write_file(filename, "json", res_dict)
    # print(res_dict["resultbody"]["jobinfo"])
    return res_dict["resultbody"]["jobinfo"]
```
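
Called on its own, the wrapper fetches a single record and also dumps the raw JSON response under `./data/json/` through `write_file`. A sketch (the jobid is the docstring's example value and may well be offline by now; the return value is assumed to be the HTML job description, which is why `parse_job_msg` strips tags from it):

```python
detail = get_51rz_json("job_detail", {"jobid": 126817691})
print(detail[:200])   # start of the job description, HTML tags still included
```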

```python
@time_logging
def get_job_msg(job_detail_url):
    """Fetch the job description and detailed requirements from a jobs.51job.com detail page."""
    try:
        job_detail_res = get(job_detail_url, headers=headers)
        page = job_detail_res.content
        eroot = etree.HTML(page)
        job_name = eroot.xpath(
            "/html/body/div[3]/div[2]/div[2]/div/div[1]/h1[1]/text()")[0]
        co_name = eroot.xpath(
            '/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')[0]
        jobid = eroot.xpath('//*[@id="hidJobID"]/@value')[0]
        _content = eroot.xpath(
            '//div[@class="tCompany_center clearfix"]//text()')
    except Exception as e:
        logging.error("解析[{}]-失败- {}".format(job_detail_url, e))
        return ""
    # only used by the commented-out write_file call below
    filename = "{0}-{1}-{2}".format(job_name, co_name, jobid).replace(
        "(", "").replace(")", "").replace("/", "_").replace("*", "")
    # print(_content)
    # write_file(filename, "html", _content)
    # job description and detailed requirements
    job_msg_str = eroot.xpath("//div[@class='bmsg job_msg inbox']/p/text()")
    # simple cleanup: collapse the whitespace inside every paragraph
    for i in range(len(job_msg_str)):
        job_msg_str[i] = "".join(job_msg_str[i].split())
    return "".join(job_msg_str)


def write_file(filename, fileext, datas):
    """Write data to ./data/<fileext>/<filename>.<fileext>."""
    fileext_ignore = ["html", "log"]  # extensions whose progress is not printed
    if not os.path.exists("./data/{}".format(fileext)):
        os.makedirs("./data/{}".format(fileext))
    filenames = "{0}.{1}".format(filename, fileext).replace(
        "/", "_").replace("\\", "_")
    filepath = "./data/{0}/{1}".format(fileext, filenames)
    is_write = os.path.exists(filepath)
    try:
        with open(filepath, 'a', encoding="utf8", newline="") as f:
            if fileext not in fileext_ignore:
                print("正在写入文件-[{0}].....".format(filenames))
            if fileext == "csv":
                if 'dict' in str(type(datas[0])):
                    header = [x for x in datas[0].keys()]
                    # print(type(header), header)
                    # the dict keys double as the CSV column names
                    writer = csv.DictWriter(f, fieldnames=header)
                    if not is_write:
                        writer.writeheader()  # write the header row only for a new file
                    writer.writerows(datas)   # write the data rows
                elif 'list' in str(type(datas[0])):
                    writer = csv.writer(f)
                    writer.writerows(datas)
                else:
                    csv.writer(f).writerows(datas)
            elif fileext == 'json':
                json.dump(datas, f, ensure_ascii=False)
            else:
                f.writelines(datas)
            if fileext not in fileext_ignore:
                print("[{}]-共写入{}条数据".format(filenames, len(datas)))
            logging.info(
                "文件-[{0}]-写入成功,共有{1}条数据".format(filenames, len(datas)))
    except Exception as e:
        logging.error(
            "文件-[{}]-写入出错:{},数据详情:数据{},数据长度{}".format(filenames, e, datas, len(datas)))
```

```python
@time_logging
def parse_key(key, pages=1):
    """Crawl and process the postings for a single keyword.

    :param key: search keyword
    :param pages: number of result pages to crawl
    :return:
    """
    search_job_dict = search_job(key)
    try:
        total_page = int(search_job_dict["total_page"])
    except TypeError as e:
        total_page = 0
        print("不存在与{}相关的岗位,请尝试换个关键字".format(key))
        logging.error("不存在与{}相关的岗位,请尝试换个关键字,{}".format(key, e))
    print("----------------与{}相关的岗位一共有{}个页面----------------".format(key, total_page))
    if pages > total_page:
        pages = total_page
    for i in range(1, pages + 1):
        try:
            job_json = search_job(key, i)
            job_data = parse_job_msg(job_json)
            write_file("{}_{}".format(key, i), "json", job_json)
            write_file(key + "相关岗位", "csv", job_data)
        except Exception as e:
            logging.error("处理-{}-第{}个页面时出错-{}".format(key, i, e))
    logging.info("{0}相关岗位信息爬取完毕!".format(key))


@time_logging
def main(key_list, count):
    """
    :param key_list: list of search keywords
    :param count: number of pages to crawl per keyword
    :return:
    """
    logging_init("./config/logconfig.json")
    for key in key_list:
        print("-----------------开始搜索{}相关的岗位信息------------------".format(key))
        parse_key(key, count)
    rename_dir()  # rename the data folder to data_{current timestamp} so the next run can save fresh data
    print("列表关键字已爬取完毕!")
    logging.info("列表关键字已爬取完毕!")


def rename_dir():
    if os.path.exists("./data"):
        try:
            os.rename("./data", "./data_{}".format(int(time.time())))
        except OSError as e:
            logging.error("{}更改文件夹名称无管理员权限".format(e))
            print("-------尝试更改data文件夹名称失败,请手动更改data文件夹名称-【防止下次爬取时数据重写】--------")


def logging_init(path, default_level=logging.INFO):
    """Initialise logging.

    :param path: path of the logging config file
    :param default_level: fallback log level when no config file is found
    :return:
    """
    if not os.path.exists("./log"):
        os.makedirs("./log")
    if os.path.exists(path):
        with open(path, "r") as f:
            config = json.load(f)
            logging.config.dictConfig(config)
            logging.getLogger("runtime")
    else:
        logging.basicConfig(level=default_level)
        logging.info("{}不存在,使用默认的日志配置!".format(path))
```

```python
if __name__ == '__main__':
    keywords = ["python", "java", "c#", "web前端", "c/c++", "linux"]
    pages = 300
    main(keywords, pages)
```
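
Each keyword ends up as one CSV under `./data/csv/` (moved to `./data_{timestamp}/csv/` once the run finishes). For the analysis side hinted at in the title, the file loads straight into pandas; a minimal sketch, assuming pandas is installed and the run folder was renamed to `data_1605000000` (a made-up timestamp):

```python
import pandas as pd

df = pd.read_csv("./data_1605000000/csv/python相关岗位.csv")
print(df[["职位名称", "最低薪资(千/月)", "最高薪资(千/月)", "工作地点"]].head())
print(df.groupby("工作地点")["最高薪资(千/月)"].mean())   # rough average top salary per city
```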

Turning to the Sina Weibo part of the title: scraping Weibo comment data with Python takes the following steps:

1. Log in to the Weibo Open Platform, create an application and obtain an App Key and App Secret.
2. Use the App Key and App Secret to obtain an access_token.
3. Use the access_token to call Weibo's API endpoints.
4. Fetch the comment data through the API.
5. Store the data in a database or write it to a CSV file.

Below is a simple example showing how to fetch Weibo comment data with Python and store it in a database:

```python
import os
import sys
import time
import json
import pymysql
import requests
from urllib.parse import quote_plus
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

app_key = os.getenv("APP_KEY")
app_secret = os.getenv("APP_SECRET")
access_token = os.getenv("ACCESS_TOKEN")

# database configuration
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_name = os.getenv("DB_NAME")

# connect to the database
db = pymysql.connect(host=db_host, port=int(db_port), user=db_user,
                     password=db_password, db=db_name, charset="utf8mb4")
cursor = db.cursor()

# Weibo API configuration
base_url = "https://api.weibo.com/2/comments/show.json"
max_count = 200
since_id = None
max_id = None

while True:
    # build the API request parameters
    params = {
        "access_token": access_token,
        "source": app_key,
        "count": max_count,
        "since_id": since_id,
        "max_id": max_id,
    }

    # send the API request
    response = requests.get(base_url, params=params)
    if response.status_code != 200:
        print("Failed to get comments data from Weibo API.")
        sys.exit(1)

    # parse the API response
    data = json.loads(response.text)
    comments = data["comments"]

    # walk through the comments and insert them into the database
    for comment in comments:
        created_at = datetime.strptime(comment["created_at"], "%a %b %d %H:%M:%S +0800 %Y")
        text = comment["text"]
        user_id = comment["user"]["id"]
        user_name = comment["user"]["name"]
        mid = comment["mid"]
        sql = "INSERT INTO comments (created_at, text, user_id, user_name, mid) VALUES (%s, %s, %s, %s, %s)"
        try:
            cursor.execute(sql, (created_at, text, user_id, user_name, mid))
            db.commit()
        except Exception:
            db.rollback()

    # update the paging parameters
    if len(comments) == 0:
        break
    else:
        since_id = comments[0]["id"]
        max_id = comments[-1]["id"]

    # throttle the request rate
    time.sleep(5)
```

The code above uses the dotenv library to read environment variables, so create a file named ".env" in the project root with the following entries:

```text
APP_KEY=your_app_key
APP_SECRET=your_app_secret
ACCESS_TOKEN=your_access_token
DB_HOST=your_db_host
DB_PORT=your_db_port
DB_USER=your_db_user
DB_PASSWORD=your_db_password
DB_NAME=your_db_name
```

Note: the table names "comments" and "comments_data" in the code above are only examples; change them to match your actual database.