# 侵删 (notice: this will be taken down on request if it infringes anyone's rights)
import datetime
import json
import os
import random
import re
import time
import traceback

import jsonpath
import paramiko
import requests
from bs4 import BeautifulSoup
from hdfs import *
# Crawl job postings from zhipin.com (Boss Zhipin) and append one "^"-separated
# record per posting to a dated output file.

# Output file is named after today's date (MM-DD) with a "_boss" suffix.
fileName = time.strftime("%m-%d")
OUTPUT_PATH = "g://job/" + fileName + "_boss"

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}

# Look-back window in days for the listings to crawl: 1, 3, 7 or 15.
aaa_boss = 1
# Maps the look-back window (days) to the site's "period" query value.
PERIOD_BY_DAYS = {1: 1, 3: 2, 7: 3, 15: 4}

# Network tuning: without a timeout a stalled connection would hang forever,
# and without a retry cap a dead network would loop on the same page forever.
REQUEST_TIMEOUT = 10
MAX_LIST_RETRIES = 3
RETRY_DELAY_SECONDS = 2

SITE_ROOT = "https://www.zhipin.com"

# Position/industry codes that are substituted into the listing URL.
# TODO: move these to the very top of the file later.
hangye_boss_all = [150407,150408,150409,150410,150411,150412,150413,150414,100103,100101,100102,100104,100105,100106,100107,100108,100109,100110,100111,100112,100113,100114,100115,100116,100117,100118,100119,100120,100121,100122,100201,
    100202,100203,100204,100205,100206,100208,210607,210608,210609,210601,210602,210603,210604,210405,210605,210606,290401,300108,300107,300106,300105,300104,300103,300102,300101,300207,300206,300205,300204,300203,300202,300201,300317,300316,
    300315,300314,300313,300312,300311,100209,100210,100308,100307,100306,100305,100304,100303,100302,100301,100409,100408,100407,100406,100405,100404,100403,100402,100401,100511,100512,100513,100506,100507,100508,100509,100607,100606,100605,100604,100603,100602,
    100601,100816,100815,100814,100813,100812,100811,100810,100809,100808,100807,100806,100805,100804,100803,100802,100801,100904,100903,100902,100901,101018,101017,101016,101015,101014,101013,101012,101011,101010,101009,101008,101007,101006,101005,101004,101003,101002,101001,101404,101403,101402,101401,
    100707,100706,100705,100704,100703,100702,100701,101307,101306,101305,101304,101303,101302,101301,101202,101201,101101,110108,110107,110106,110105,110104,110103,110102,110101,110304,110303,110302,110401,120121,120120,120119,120118,
    120117,120116,120115,120114,120113,120112,120111,120110,120109,120108,120107,120106,120105,120104,120103,120102,120101,120204,120203,120202,120201,120408,120407,120404,120403,120402,120401,120406,120405,120404,120403,120402,120401,
    120610,120608,120607,120606,120605,120604,120603,120602,120601,120501,130120,130119,130118,130117,130116,130115,130114,130113,130112,130111,130110,130109,130108,130107,130106,130105,130104,130103,130102,
    130101,130206,130205,130204,130203,130202, 130201, 130309, 130308, 130307,130306, 130305, 130304, 130303, 130302, 130301, 130405, 130404, 130403, 130402, 130401, 130501, 140114, 140113, 140112, 140111, 140110,
    140109, 140108,140107,140106,140105,140104,140103,140102,140101,140206,140205,140204,140203,140202,140201,140506,140505,140504,140503,140502,140501,140601,140602,140603,140604,140605,140607,140608,140609,140610,140611,140401,140404,140405,140406,140407,140701,150107,
    150106,150105,150104,150103,150102,150101,150403,150406,150108,150109,150110,150201,150202,150204,150205,150207,150208,150401,150209,150309,150308,150404,150402,
    150307,150306,150305,150304,150303,150302,150301,150203,150502,150503,150504,150505,150506,150507,150601,140317,140316,140315,140314,140313,140312,140311,140310,140309,
    140308,140307,140306,140305,140304,140303,140302,140301,160104,160103,160102,160101,140403,140402,160201,170108,170107,170106,170105,170104,170103,170102,170101,170306,170305,170304,170303,170302,170301,
    170406,170405,170404,170403,170402,170401,170205,170204,170203,170202,170201,170207,170208,170209,170210,170211,170616,170617,170618,170619,170615,170614,170613,170612,170611,170610,170609,170608,170607,170606,170605,170604,170603,170602,170601,170501,180119,
    180111,180118,180117,180116,180115,180114,180113,180112,180104,180103,180101,180204,180203,180202,180201,180304,180303,
    180302,180301,180107,180406,180405,180105,180404,180403,180402,180102,180401,180110,180109,180108,180107,180106,180105,180104,180103,180102,180101,180703,180702,180701,
    180802,180801,180601,230108,230107,230106,230105,230104,
    230103,230102,230101,230110,230212,230211,230210,230209,230209,230208,230207,230206,230205,230204,230203,230202,230201,230301,190107,190106,190105,190104,190103,190102,190101,190204,190203,190202,190201,190313,190312,190311,190310,190309,
    190308,190307,190306,190305,190304,190303,190302,190301,190411,190410,190409,190408,190407,190406,190405,190404,190403,190402,190401,190504,190503,190502,190501,190603,190602,190601,190707,190706,190705,190704,190703,190702,
    190701,190801,210803,210802,210801,210112,210113,210114,210115,210116,210117,210118,210119,210120,210121,210122,210101,210102,210103,210104,210105,210106,210107,210108,210109,210110,210111,210201,210202,210305,210304,210303,210302,
    210301,210404,210403,210402,210401,210504,210505,210506,210503,210502,210501,210701,250107,250106,250105,250104,
    250103,250102,250101,250204,250203,250202,250201,250301,240117,240116,240115,240114,240113,240112,240111,240110,240109,240108,240107,240106,240105,240104,240103,240102,240101,240206,240205,240204,240203,240202,240201,240304,240303,240302,240301,240402,240401,240501,220103,220102,220101,220212,
    220211,220210,220209,220208,220207,220206,220205,220204,220203,220202,220201,220303,220302,220301,220403,220402,220401,220505,220504,220503,220502,220501,220601,260110,260109,260108,260107,260106,260105,260104,260103,260102,260101,260203,260202,260201,260308,260307,260306,260305,260304,260303,
    260302,260301,260404,260404,260402,260401,260501,270103,270102,270101,270201,280105,280104,280103,280102,280101,280202,280201,280301,290105,290106,290104,290103,290102,290101,290211,290210,290206,290207,290208,290209,290201,
    290202,290203,290204,290205,290301,290305,290306,290307,290302,290303,290304,300310,300309,300308,300307,300306,300305,300304,300303,300302,300301,300406,300405,300404,300403,300402,300401,300510,300509,300508,300507,300506,300505,300504,300503,300502,300501,300606,300605,
    300604,300603,300602,300601,300701,200101]


def unique_in_order(items):
    """Return *items* as a list with duplicates dropped, keeping first-seen order.

    The code list above contains repeated ids; crawling them twice would write
    the same postings to the output file twice.
    """
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def build_list_url(position_code, page, days=1):
    """Build the listing URL for one position code and page.

    Args:
        position_code: numeric code inserted after ``-p`` in the URL path.
        page: 1-based page number.
        days: look-back window in days; must be a key of ``PERIOD_BY_DAYS``.

    Raises:
        ValueError: if *days* is not a supported window.
    """
    try:
        period = PERIOD_BY_DAYS[days]
    except KeyError:
        raise ValueError("unsupported look-back window: {!r}".format(days)) from None
    return "{}/c100010000-p{}/?period={}&page={}&ka=page-{}".format(
        SITE_ROOT, position_code, period, page, page)


def has_exact_class(tag_name, class_name):
    """Return a BeautifulSoup filter for *tag_name* whose class list is exactly ``[class_name]``."""
    def matches(tag):
        return tag.name == tag_name and tag.get("class") == [class_name]
    return matches


def split_job_summary(paragraph_html):
    """Split the card's summary ``<p>`` markup into (location, experience, education).

    The three fields are separated in the markup by ``<em class="vline"></em>``.

    Raises:
        IndexError: if fewer than three fields are present.
    """
    flattened = (paragraph_html
                 .replace("<p>", "")
                 .replace("</p>", "")
                 .replace('<em class="vline"></em>', ","))
    parts = flattened.split(",")
    return parts[0], parts[1], parts[2]


def parse_publish_date(text, today=None):
    """Convert the card's "发布于..." text into a ``MM-DD`` style date string.

    Args:
        text: text of the publish paragraph, e.g. "发布于10:30", "发布于昨天"
            or "发布于02月27日".
        today: ``datetime.date`` used as the reference day; defaults to the
            current local date.

    Returns:
        Today's date when a clock time is shown, the previous calendar day for
        "昨天", ``<month>-<day>`` as written on the page for an explicit date,
        or "" when the text is not recognised.
    """
    if today is None:
        today = datetime.date.today()
    _, marker, when = text.partition("发布于")
    if not marker:
        return ""
    if ":" in when:
        # A clock time means the posting was published today.
        return today.strftime("%m-%d")
    if when == "昨天":
        # Real calendar arithmetic: handles Jan 1 -> 12-31 and February,
        # and stays zero-padded like the "today" format.
        return (today - datetime.timedelta(days=1)).strftime("%m-%d")
    month, month_mark, rest = when.partition("月")
    if not month_mark:
        return ""
    return month + "-" + rest.partition("日")[0]


def fetch_soup(url):
    """Download *url* and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on any network failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def current_page_number(soup):
    """Return the page number highlighted in the pager, or None if there is none."""
    pager = soup.find(has_exact_class("div", "page"))
    if pager is None:
        return None
    current = pager.find(has_exact_class("a", "cur"))
    if current is None:
        return None
    try:
        return int(current.get_text())
    except ValueError:
        return None


def category_name(soup):
    """Extract the category name from the page title (text before "招聘")."""
    title_tag = soup.find("title")
    if title_tag is None:
        return ""
    return title_tag.get_text().replace("【全国", "").partition("招聘")[0]


def parse_job_card(card):
    """Extract the listing-page fields of one ``div.job-primary`` card.

    Raises:
        AttributeError, IndexError, KeyError, TypeError: if the card lacks an
            expected element or attribute.
    """
    info = card.find(has_exact_class("div", "info-primary"))
    location, experience, education = split_job_summary(str(info.find("p")))

    publish_tag = card.find(has_exact_class("div", "info-publis")).find("p")
    publish_date = ""
    if publish_tag is not None:
        publish_date = parse_publish_date(publish_tag.get_text())

    company_link = card.find(has_exact_class("div", "company-text")).find("a")
    return {
        "job_url": SITE_ROOT + info.find("a")["href"],
        "job_name": info.find(has_exact_class("div", "job-title")).get_text(),
        "salary": info.find("span").get_text(),
        "location": location,
        "experience": experience,
        "education": education,
        "publish_date": publish_date,
        "company_name": company_link.get_text(),
        "company_url": SITE_ROOT + company_link["href"],
    }


def fetch_job_description(job_url):
    """Return the job description from the detail page, with newlines removed.

    Falls back to a fixed placeholder string when the description element is
    not found on the page.

    Raises:
        requests.RequestException: on any network failure or timeout.
    """
    soup = fetch_soup(job_url)
    section = soup.find(has_exact_class("div", "job-sec"))
    body = None
    if section is not None:
        body = section.find(has_exact_class("div", "text"))
    description = body.get_text() if body is not None else "自己打开网址看"
    return description.replace('\n', "")


def format_record(category, job, job_description):
    """Join one posting's fields into a single "^"-separated output line.

    Field order: source site, category, company name, company url, company
    description, job name, job url, job description, salary, location,
    education, experience, headcount (always "0"), publish date.
    """
    # The company detail page is deliberately not fetched (too slow, per the
    # placeholder text), so a fixed placeholder is written instead.
    company_description = "自己点开看吧,爬的太慢了"
    fields = [
        "boss",
        category,
        job["company_name"],
        job["company_url"],
        company_description,
        job["job_name"],
        job["job_url"],
        job_description,
        job["salary"],
        job["location"],
        job["education"],
        job["experience"],
        "0",
        job["publish_date"],
    ]
    return "^".join(fields) + "\n"


def crawl_category(out, position_code, days):
    """Crawl every listing page of one position code and write records to *out*.

    Stops when the pager is missing or no longer highlights the requested page
    (i.e. the site has run out of pages), or after ``MAX_LIST_RETRIES``
    consecutive failed downloads of the same page.
    """
    page = 1
    failures = 0
    while True:
        url = build_list_url(position_code, page, days)
        try:
            soup = fetch_soup(url)
        except requests.RequestException:
            print("boss直聘爬取异常1")
            failures += 1
            if failures >= MAX_LIST_RETRIES:
                break
            time.sleep(RETRY_DELAY_SECONDS)
            continue
        failures = 0
        print(page)

        if current_page_number(soup) != page:
            break
        category = category_name(soup)

        for card in soup.find_all(has_exact_class("div", "job-primary")):
            try:
                job = parse_job_card(card)
            except (AttributeError, IndexError, KeyError, TypeError):
                # One malformed card must not abort the whole crawl.
                traceback.print_exc()
                continue
            try:
                description = fetch_job_description(job["job_url"])
            except requests.RequestException:
                print("boss直聘爬取异常2")
                continue
            record = format_record(category, job, description)
            out.write(record)
            print(record)

        print(position_code)
        page += 1


def crawl_boss():
    """Run the full crawl over every position code, appending to ``OUTPUT_PATH``."""
    with open(OUTPUT_PATH, "a", encoding="UTF-8") as out:
        print("开始爬取boss直聘")
        try:
            for position_code in unique_in_order(hangye_boss_all):
                crawl_category(out, position_code, aaa_boss)
        except Exception:
            # Top-level boundary: report and still close the file cleanly.
            traceback.print_exc()
            print("boss直聘爬取异常3")
        print("boss直聘爬取完毕")


if __name__ == "__main__":
    crawl_boss()