Scraping Sina Weibo data with Python / Scraping and analyzing 51job job postings

```python
# -*- coding:utf-8 -*-
# @Time : 2020-11-10 20:57
# @Author : BGLB
# @Software : PyCharm
import csv
from decimal import Decimal
import hashlib
import json
import logging
import logging.config
import os
import random
import re
import time
from urllib import parse

from lxml import html
from requests import get

etree = html.etree

headers = {
    "Host": "search.51job.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400",
}


def time_logging(func):
    """Decorator that records how long a function takes to run.

    :param func: the function being timed
    :return: the wrapped function
    """
    def wrapper(*args, **kw):
        start_time = time.time()
        func_result = func(*args, **kw)
        runtime = time.time() - start_time
        if runtime < 60:
            runtime = "{:.2f}s".format(runtime)
        elif runtime < 3600:
            runtime = "{:.2f}m".format(runtime / 60)
        else:
            runtime = "{:.2f}h".format(runtime / 3600)
        content = '[{0:^15}] - 运行时间 - [{1:^6}]'.format(func.__name__, runtime)
        # print(content)
        logging.info(content)
        with open("./log/runtime.log", 'a', encoding='utf8') as f:
            f.writelines(content + '\n')
        return func_result
    return wrapper
```
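
As a quick illustration (with a hypothetical `slow_step` function that is not part of the scraper): any function wrapped with the decorator gets its elapsed time sent to the root logger and appended to `./log/runtime.log`, so that folder has to exist before the first wrapped call returns:

```python
os.makedirs("./log", exist_ok=True)   # the wrapper appends to ./log/runtime.log

@time_logging
def slow_step():
    time.sleep(1.2)                   # stand-in for real work

slow_step()   # appends something like "[   slow_step   ] - 运行时间 - [1.20s ]"
```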

```python
def search_job(job_key, page_num=1):
    """Search postings in four cities (Shanghai, Guangzhou, Shenzhen, Wuhan); each result page holds fifty postings."""
    url = "https://search.51job.com/list/020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,{},2,{}.html"
    response = get(url.format(parse.quote(
        parse.quote(job_key)), page_num), headers=headers)
    page = response.content.decode(response.encoding)
    eroot = etree.HTML(page)
    # the results are embedded in an inline <script> as a JS assignment; split on "=" to get the JSON
    table_list = eroot.xpath('//script[@type="text/javascript"]')
    # print(table_list[2].text)
    # print(table_list[2].text.split("=",1)[-1])
    json_str = json.loads(table_list[2].text.split("=", 1)[-1])
    return json_str
```
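
A quick usage sketch (assuming the search page still embeds its results the way this script expects): the returned dict is used elsewhere only through its `total_page`, `curr_page` and `engine_search_result` keys.

```python
result = search_job("python", page_num=1)      # first result page for the keyword "python"
print(result["total_page"])                    # how many result pages exist for this keyword
print(len(result["engine_search_result"]))     # up to 50 postings on this page
```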

```python
@time_logging
def parse_job_msg(search_result):
    """Parse one page of search results into a list of dicts, one per posting."""
    print(
        "-------------正在解析第{}个页面数据--------------".format(search_result["curr_page"]))
    job_msg_list = search_result["engine_search_result"]  # 50 postings per page
    csv_list = []
    for job_msg in job_msg_list:
        # job id
        jobid = job_msg["jobid"]
        # company id
        coid = job_msg["coid"]
        # job url
        job_href = job_msg["job_href"]
        if job_href.split("/")[2].split(".")[0] == "jobs":
            job_detail_str = get_job_msg(job_href)
        else:
            # postings not hosted on jobs.51job.com go through the JSON API;
            # strip the HTML tags from the returned description
            pattern = re.compile(r'<[^>]+>', re.S)
            job_detail_str = pattern.sub(
                '', get_51rz_json("job_detail", {"jobid": jobid}))
        # job title
        job_name = job_msg["job_name"]
        # company url
        co_href = job_msg["company_href"]
        # company name
        co_name = job_msg["company_name"]
        # salary text, later parsed into minimum / maximum
        money = job_msg["providesalary_text"]
        # work location
        workarea = job_msg["workarea_text"]
        # company type
        co_type = job_msg["companytype_text"]
        # publish date
        update_time = job_msg["issuedate"]
        # benefits
        jobwelf = job_msg["jobwelf"]
        if money == "" or money is None:
            logging.error("{}的工作薪资{}获取失败".format(job_href, money))
            continue
        # 'attribute_text': ['上海-闵行区', '1年经验', '大专', '招2人']
        job_attr = job_msg["attribute_text"]
        job_po_tmp = job_year_tmp = ""
        job_education = "不限"
        for x in job_attr:
            if '招' in x:
                job_po_tmp = x
            if '经验' in x:
                job_year_tmp = x
            if x in "高中大专本科博士硕士":
                job_education = x
        panter = re.compile(r'\d+')
        if len(panter.findall(job_po_tmp)) > 0:
            job_po = int(panter.findall(job_po_tmp)[0])
        else:
            job_po = 0
        if len(panter.findall(job_year_tmp)) > 0:
            job_year = int(panter.findall(job_year_tmp)[0])
        else:
            job_year = 0
        # company headcount
        co_people = job_msg["companysize_text"]
        # company business scope
        co_jx = job_msg['companyind_text']
        ss_s = money.split("-")
        if len(ss_s) < 2:
            money_min = money_max = 0
        else:
            money_min, money_max = parse_money(money)
        csv_dict = {
            "职位名称": job_name,
            "最低薪资(千/月)": money_min,
            "最高薪资(千/月)": money_max,
            "招聘人数": job_po,
            "工作经验(年)": job_year,
            "最低学历": job_education,
            "工作地点": workarea.split("-")[0],
            "工作福利": jobwelf,
            "职位描述和详细条件": job_detail_str,
            "公司名称": co_name,
            "公司类型": co_type,
            "公司人数": co_people,
            "公司经营范围": co_jx,
            "职位详情url": job_href,
            "公司详情url": co_href,
            "发布时间": update_time,
        }
        csv_list.append(csv_dict)
    return csv_list
```
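
For the sample `attribute_text` shown in the comment above, the attribute loop plus the `\d+` regex boil the list down to three structured fields; a standalone re-run of that logic:

```python
import re

job_attr = ['上海-闵行区', '1年经验', '大专', '招2人']
job_po_tmp = job_year_tmp = ""
job_education = "不限"
for x in job_attr:
    if '招' in x:
        job_po_tmp = x            # '招2人'   -> number of openings
    if '经验' in x:
        job_year_tmp = x          # '1年经验' -> years of experience required
    if x in "高中大专本科博士硕士":
        job_education = x         # '大专'    -> minimum education
panter = re.compile(r'\d+')
job_po = int(panter.findall(job_po_tmp)[0]) if panter.findall(job_po_tmp) else 0
job_year = int(panter.findall(job_year_tmp)[0]) if panter.findall(job_year_tmp) else 0
print(job_po, job_year, job_education)   # 2 1 大专
```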

```python
def parse_money(money_text):
    """Turn a salary string such as '1-1.5万/月' into [min, max] in thousand CNY per month."""
    money_min = money_max = Decimal(0)
    ss_s = money_text.split("-")
    if len(ss_s) >= 2:
        money_min = Decimal(ss_s[0])
        money_max = Decimal(ss_s[1].split("/")[0][:-1])
        if money_text.split('/')[0][-1] == "万":
            money_min = 10 * money_min
            money_max = 10 * money_max
        if money_text.split('/')[-1] == "年":
            money_max /= 12
            money_min /= 12
    return [money_min.quantize(Decimal("0.00")), money_max.quantize(Decimal("0.00"))]
```
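
A few worked conversions, assuming the salary strings follow 51job's usual `最低-最高万|千/月|年` pattern (everything is normalised to thousand CNY per month):

```python
print(parse_money("1-1.5万/月"))   # [Decimal('10.00'), Decimal('15.00')]
print(parse_money("6-8千/月"))     # [Decimal('6.00'), Decimal('8.00')]
print(parse_money("10-20万/年"))   # [Decimal('8.33'), Decimal('16.67')]  (yearly pay divided by 12)
```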

```python
def init_params(oparams):
    """Build the signed query parameters, reverse-engineered from the site's JavaScript."""
    key = "tuDmheJQBlgy&Sm300l8xK^X4NzFYBcrN8@YLCret$fv1AZbtujg*KN^$YnUkh"
    keyindex = random.randint(4, 40)
    sParams = json.dumps(oparams)
    md5 = hashlib.md5()
    md5.update(("coapi" + sParams + str(key[keyindex:keyindex + 15])).encode("utf8"))
    sign = md5.hexdigest()
    # print(md5.hexdigest())
    return {
        "key": keyindex,
        "sign": sign,
        "params": sParams
    }
```
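
The signature is the MD5 of `"coapi"` + the JSON-encoded params + a 15-character slice of the shared key, and the random slice offset is sent along as `key` so the server can rebuild the same string. A quick look at the shape of the result (the jobid is the example value from `get_51rz_json`'s docstring; `sign` changes with the random offset):

```python
signed = init_params({"jobid": 126817691})
print(signed["key"])      # e.g. 17 - random offset into the shared key string
print(signed["sign"])     # 32-character md5 hex digest
print(signed["params"])   # '{"jobid": 126817691}'
```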

```python
@time_logging
def get_51rz_json(interface: str, params: dict):
    """Wrapper around the coapi.51job.com endpoints used by 51rz.51job.com.

    job_list         - job listing
    job_detail       - job detail, e.g. {"jobid": 126817691}
    commpany_list    - company listing
    commpany_detail  - company detail, e.g. {"coid": ...}
    job_condition    - job requirements
    job_time_table   - job schedule
    """
    url_interface = {
        "job_list": "https://coapi.51job.com/job_list.php",
        "job_detail": "https://coapi.51job.com/job_detail.php",
        "commpany_list": "https://coapi.51job.com/co_list.php",
        "commpany_detail": "https://coapi.51job.com/job_company.php",
        "job_condition": "https://coapi.51job.com/job_condition.php",  # job requirements
        "job_time_table": "https://coapi.51job.com/job_schedule.php",  # job schedule
    }
    header = {
        "Host": "coapi.51job.com",
        "Referer": "https://51rz.51job.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    url = url_interface[interface]
    res = get(url, init_params(params), headers=header)
    # print(res.url)
    res_str = res.content.decode("utf8")
    filename = "{}".format(interface)
    for x in params.values():
        filename += "_" + str(x)
    # the response is JSONP-style: strip the "callback(...)" wrapper to get the JSON body
    res_json = res_str.split("(", 1)[-1][0:-1]
    res_dict = dict(json.loads(res_json))
    res_dict["html_url"] = res.url
    write_file(filename, "json", res_dict)
    # print(res_dict["resultbody"]["jobinfo"])
    return res_dict["resultbody"]["jobinfo"]
```
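
Called on its own, the wrapper fetches a single record and also dumps the raw JSON response under `./data/json/` through `write_file`. A sketch (the jobid is the docstring's example value and may well be offline by now; the return value is assumed to be the HTML job description, which is why `parse_job_msg` strips tags from it):

```python
detail = get_51rz_json("job_detail", {"jobid": 126817691})
print(detail[:200])   # start of the job description, HTML tags still included
```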

```python
@time_logging
def get_job_msg(job_detail_url):
    """Fetch the job description and detailed requirements from a jobs.51job.com detail page."""
    try:
        job_detail_res = get(job_detail_url, headers=headers)
        page = job_detail_res.content
        eroot = etree.HTML(page)
        job_name = eroot.xpath(
            "/html/body/div[3]/div[2]/div[2]/div/div[1]/h1[1]/text()")[0]
        co_name = eroot.xpath(
            '/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')[0]
        jobid = eroot.xpath('//*[@id="hidJobID"]/@value')[0]
        _content = eroot.xpath(
            '//div[@class="tCompany_center clearfix"]//text()')
    except Exception as e:
        logging.error("解析[{}]-失败- {}".format(job_detail_url, e))
        return ""
    # only used by the commented-out write_file call below
    filename = "{0}-{1}-{2}".format(job_name, co_name, jobid).replace(
        "(", "").replace(")", "").replace("/", "_").replace("*", "")
    # print(_content)
    # write_file(filename, "html", _content)
    # job description and detailed requirements
    job_msg_str = eroot.xpath("//div[@class='bmsg job_msg inbox']/p/text()")
    # simple cleanup: collapse the whitespace inside every paragraph
    for i in range(len(job_msg_str)):
        job_msg_str[i] = "".join(job_msg_str[i].split())
    return "".join(job_msg_str)


def write_file(filename, fileext, datas):
    """Write data to ./data/<fileext>/<filename>.<fileext>."""
    fileext_ignore = ["html", "log"]  # extensions whose progress is not printed
    if not os.path.exists("./data/{}".format(fileext)):
        os.makedirs("./data/{}".format(fileext))
    filenames = "{0}.{1}".format(filename, fileext).replace(
        "/", "_").replace("\\", "_")
    filepath = "./data/{0}/{1}".format(fileext, filenames)
    is_write = os.path.exists(filepath)
    try:
        with open(filepath, 'a', encoding="utf8", newline="") as f:
            if fileext not in fileext_ignore:
                print("正在写入文件-[{0}].....".format(filenames))
            if fileext == "csv":
                if 'dict' in str(type(datas[0])):
                    header = [x for x in datas[0].keys()]
                    # print(type(header), header)
                    # the dict keys double as the CSV column names
                    writer = csv.DictWriter(f, fieldnames=header)
                    if not is_write:
                        writer.writeheader()  # write the header row only for a new file
                    writer.writerows(datas)   # write the data rows
                elif 'list' in str(type(datas[0])):
                    writer = csv.writer(f)
                    writer.writerows(datas)
                else:
                    csv.writer(f).writerows(datas)
            elif fileext == 'json':
                json.dump(datas, f, ensure_ascii=False)
            else:
                f.writelines(datas)
            if fileext not in fileext_ignore:
                print("[{}]-共写入{}条数据".format(filenames, len(datas)))
            logging.info(
                "文件-[{0}]-写入成功,共有{1}条数据".format(filenames, len(datas)))
    except Exception as e:
        logging.error(
            "文件-[{}]-写入出错:{},数据详情:数据{},数据长度{}".format(filenames, e, datas, len(datas)))
```

```python
@time_logging
def parse_key(key, pages=1):
    """Crawl and process the postings for a single keyword.

    :param key: search keyword
    :param pages: number of result pages to crawl
    :return:
    """
    search_job_dict = search_job(key)
    try:
        total_page = int(search_job_dict["total_page"])
    except TypeError as e:
        total_page = 0
        print("不存在与{}相关的岗位,请尝试换个关键字".format(key))
        logging.error("不存在与{}相关的岗位,请尝试换个关键字,{}".format(key, e))
    print("----------------与{}相关的岗位一共有{}个页面----------------".format(key, total_page))
    if pages > total_page:
        pages = total_page
    for i in range(1, pages + 1):
        try:
            job_json = search_job(key, i)
            job_data = parse_job_msg(job_json)
            write_file("{}_{}".format(key, i), "json", job_json)
            write_file(key + "相关岗位", "csv", job_data)
        except Exception as e:
            logging.error("处理-{}-第{}个页面时出错-{}".format(key, i, e))
    logging.info("{0}相关岗位信息爬取完毕!".format(key))


@time_logging
def main(key_list, count):
    """
    :param key_list: list of search keywords
    :param count: number of pages to crawl per keyword
    :return:
    """
    logging_init("./config/logconfig.json")
    for key in key_list:
        print("-----------------开始搜索{}相关的岗位信息------------------".format(key))
        parse_key(key, count)
    rename_dir()  # rename the data folder to data_{current timestamp} so the next run can save fresh data
    print("列表关键字已爬取完毕!")
    logging.info("列表关键字已爬取完毕!")


def rename_dir():
    if os.path.exists("./data"):
        try:
            os.rename("./data", "./data_{}".format(int(time.time())))
        except OSError as e:
            logging.error("{}更改文件夹名称无管理员权限".format(e))
            print("-------尝试更改data文件夹名称失败,请手动更改data文件夹名称-【防止下次爬取时数据重写】--------")


def logging_init(path, default_level=logging.INFO):
    """Initialise logging.

    :param path: path of the logging config file
    :param default_level: fallback log level when no config file is found
    :return:
    """
    if not os.path.exists("./log"):
        os.makedirs("./log")
    if os.path.exists(path):
        with open(path, "r") as f:
            config = json.load(f)
            logging.config.dictConfig(config)
            logging.getLogger("runtime")
    else:
        logging.basicConfig(level=default_level)
        logging.info("{}不存在,使用默认的日志配置!".format(path))
```

```python
if __name__ == '__main__':
    keywords = ["python", "java", "c#", "web前端", "c/c++", "linux"]
    pages = 300
    main(keywords, pages)
```
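
Each keyword ends up as one CSV under `./data/csv/` (moved to `./data_{timestamp}/csv/` once the run finishes). For the analysis side hinted at in the title, the file loads straight into pandas; a minimal sketch, assuming pandas is installed and the run folder was renamed to `data_1605000000` (a made-up timestamp):

```python
import pandas as pd

df = pd.read_csv("./data_1605000000/csv/python相关岗位.csv")
print(df[["职位名称", "最低薪资(千/月)", "最高薪资(千/月)", "工作地点"]].head())
print(df.groupby("工作地点")["最高薪资(千/月)"].mean())   # rough average top salary per city
```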

Turning to the Sina Weibo part of the title: scraping Weibo comment data with Python takes the following steps:

1. Log in to the Weibo Open Platform, create an application and obtain an App Key and App Secret.
2. Use the App Key and App Secret to obtain an access_token.
3. Use the access_token to call Weibo's API endpoints.
4. Fetch the comment data through the API.
5. Store the data in a database or write it to a CSV file.

Below is a simple example showing how to fetch Weibo comment data with Python and store it in a database:

```python
import os
import sys
import time
import json
import pymysql
import requests
from urllib.parse import quote_plus
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

app_key = os.getenv("APP_KEY")
app_secret = os.getenv("APP_SECRET")
access_token = os.getenv("ACCESS_TOKEN")

# database configuration
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_name = os.getenv("DB_NAME")

# connect to the database
db = pymysql.connect(host=db_host, port=int(db_port), user=db_user,
                     password=db_password, db=db_name, charset="utf8mb4")
cursor = db.cursor()

# Weibo API configuration
base_url = "https://api.weibo.com/2/comments/show.json"
max_count = 200
since_id = None
max_id = None

while True:
    # build the API request parameters
    params = {
        "access_token": access_token,
        "source": app_key,
        "count": max_count,
        "since_id": since_id,
        "max_id": max_id,
    }

    # send the API request
    response = requests.get(base_url, params=params)
    if response.status_code != 200:
        print("Failed to get comments data from Weibo API.")
        sys.exit(1)

    # parse the API response
    data = json.loads(response.text)
    comments = data["comments"]

    # walk through the comments and insert them into the database
    for comment in comments:
        created_at = datetime.strptime(comment["created_at"], "%a %b %d %H:%M:%S +0800 %Y")
        text = comment["text"]
        user_id = comment["user"]["id"]
        user_name = comment["user"]["name"]
        mid = comment["mid"]
        sql = "INSERT INTO comments (created_at, text, user_id, user_name, mid) VALUES (%s, %s, %s, %s, %s)"
        try:
            cursor.execute(sql, (created_at, text, user_id, user_name, mid))
            db.commit()
        except Exception:
            db.rollback()

    # update the paging parameters
    if len(comments) == 0:
        break
    else:
        since_id = comments[0]["id"]
        max_id = comments[-1]["id"]

    # throttle the request rate
    time.sleep(5)
```

The code above uses the dotenv library to read environment variables, so create a file named ".env" in the project root with the following entries:

```text
APP_KEY=your_app_key
APP_SECRET=your_app_secret
ACCESS_TOKEN=your_access_token
DB_HOST=your_db_host
DB_PORT=your_db_port
DB_USER=your_db_user
DB_PASSWORD=your_db_password
DB_NAME=your_db_name
```

Note: the table names "comments" and "comments_data" in the code above are only examples; change them to match your actual database.