python(2)

侵删。。。。。

import requests
from bs4 import  BeautifulSoup
import paramiko
import re
import time
from hdfs import *
import jsonpath
import json
import random
import os

import traceback
# Output file is named after today's date as month-day (e.g. "03-05_boss").
# Fixed: the original called the non-existent `time.strftim`, which raised
# AttributeError before anything else could run.
fileName = time.strftime("%m-%d")
# Append mode: several runs on the same day accumulate into one file.
# The handle is closed at the very end of the script.
file = open("g://job/" + fileName + "_boss", "a", encoding="UTF-8")
print("开始爬取boss直聘")
# User-Agent header sent with every HTTP request made by this script.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}

# ---------------------------------------------------------------------------
# Main crawl: for every industry code, walk the paginated job listing, open
# each job's detail page for its description, and append one "^"-separated
# record per job to `file`.
# ---------------------------------------------------------------------------

# Seconds to wait on a single HTTP request before treating it as failed.
_REQUEST_TIMEOUT = 30
# Consecutive failures tolerated for one listing page before the current
# industry code is abandoned (the original retried forever with no delay).
_MAX_LIST_FETCH_RETRIES = 3
# Maps "posted within N days" (aaa_boss) to the site's `period` query value.
_PERIOD_BY_DAYS = {1: 1, 3: 2, 7: 3, 15: 4}


def _has_class(tag_name, class_name):
    """Return a BeautifulSoup filter matching <tag_name> whose class list is exactly [class_name]."""
    return lambda tag: tag.name == tag_name and tag.get("class") == [class_name]


def _build_list_url(industry_code, period, page):
    """Return the nationwide listing URL for one industry code, period filter and page number."""
    return ("https://www.zhipin.com/c100010000-p" + str(industry_code)
            + "/?period=" + str(period)
            + "&page=" + str(page)
            + "&ka=page-" + str(page))


def _yesterday_mm_dd():
    """Return yesterday's local date as zero-padded "MM-DD".

    Replaces hand-written month arithmetic that produced "0-30" on January 1st
    (its `month == 1` check was unreachable) and "2-30" on March 1st.
    """
    # Local import: the file's import header does not provide datetime.
    import datetime
    return (datetime.date.today() - datetime.timedelta(days=1)).strftime("%m-%d")


def _parse_publish_date(publish_text, today_mm_dd):
    """Convert the listing's "发布于…" text into a "MM-DD" style string.

    publish_text -- text of the <p> inside the "info-publis" div.
    today_mm_dd  -- today's date formatted as "%m-%d".

    Returns today's date when the text after "发布于" contains ":" (a clock
    time), yesterday's date for "昨天", "<month>-<day>" for an "X月Y日" value,
    and "" when the text matches none of these shapes.
    """
    pieces = publish_text.split("发布于")
    if len(pieces) < 2:
        return ""
    posted = pieces[1]
    if ":" in posted:
        return today_mm_dd
    if posted == "昨天":
        return _yesterday_mm_dd()
    month_part, separator, remainder = posted.partition("月")
    if not separator:
        return ""
    return month_part + "-" + remainder.split("日")[0]


try:
    # How many days back to crawl; must be a key of _PERIOD_BY_DAYS (1, 3, 7 or 15).
    aaa_boss = 1
    # (original author's note) move these settings to the very top later

    # Industry/position codes appended to the listing URL after "-p".
    hangye_boss_all = [150407,150408,150409,150410,150411,150412,150413,150414,100103,100101,100102,100104,100105,100106,100107,100108,100109,100110,100111,100112,100113,100114,100115,100116,100117,100118,100119,100120,100121,100122,100201,
    100202,100203,100204,100205,100206,100208,210607,210608,210609,210601,210602,210603,210604,210405,210605,210606,290401,300108,300107,300106,300105,300104,300103,300102,300101,300207,300206,300205,300204,300203,300202,300201,300317,300316,
    300315,300314,300313,300312,300311,100209,100210,100308,100307,100306,100305,100304,100303,100302,100301,100409,100408,100407,100406,100405,100404,100403,100402,100401,100511,100512,100513,100506,100507,100508,100509,100607,100606,100605,100604,100603,100602,
    100601,100816,100815,100814,100813,100812,100811,100810,100809,100808,100807,100806,100805,100804,100803,100802,100801,100904,100903,100902,100901,101018,101017,101016,101015,101014,101013,101012,101011,101010,101009,101008,101007,101006,101005,101004,101003,101002,101001,101404,101403,101402,101401,
    100707,100706,100705,100704,100703,100702,100701,101307,101306,101305,101304,101303,101302,101301,101202,101201,101101,110108,110107,110106,110105,110104,110103,110102,110101,110304,110303,110302,110401,120121,120120,120119,120118,
    120117,120116,120115,120114,120113,120112,120111,120110,120109,120108,120107,120106,120105,120104,120103,120102,120101,120204,120203,120202,120201,120408,120407,120404,120403,120402,120401,120406,120405,120404,120403,120402,120401,
    120610,120608,120607,120606,120605,120604,120603,120602,120601,120501,130120,130119,130118,130117,130116,130115,130114,130113,130112,130111,130110,130109,130108,130107,130106,130105,130104,130103,130102,
    130101,130206,130205,130204,130203,130202, 130201, 130309, 130308, 130307,130306, 130305, 130304, 130303, 130302, 130301, 130405, 130404, 130403, 130402, 130401, 130501, 140114, 140113, 140112, 140111, 140110,
    140109, 140108,140107,140106,140105,140104,140103,140102,140101,140206,140205,140204,140203,140202,140201,140506,140505,140504,140503,140502,140501,140601,140602,140603,140604,140605,140607,140608,140609,140610,140611,140401,140404,140405,140406,140407,140701,150107,
    150106,150105,150104,150103,150102,150101,150403,150406,150108,150109,150110,150201,150202,150204,150205,150207,150208,150401,150209,150309,150308,150404,150402,
    150307,150306,150305,150304,150303,150302,150301,150203,150502,150503,150504,150505,150506,150507,150601,140317,140316,140315,140314,140313,140312,140311,140310,140309,
    140308,140307,140306,140305,140304,140303,140302,140301,160104,160103,160102,160101,140403,140402,160201,170108,170107,170106,170105,170104,170103,170102,170101,170306,170305,170304,170303,170302,170301,
    170406,170405,170404,170403,170402,170401,170205,170204,170203,170202,170201,170207,170208,170209,170210,170211,170616,170617,170618,170619,170615,170614,170613,170612,170611,170610,170609,170608,170607,170606,170605,170604,170603,170602,170601,170501,180119,
    180111,180118,180117,180116,180115,180114,180113,180112,180104,180103,180101,180204,180203,180202,180201,180304,180303,
     180302,180301,180107,180406,180405,180105,180404,180403,180402,180102,180401,180110,180109,180108,180107,180106,180105,180104,180103,180102,180101,180703,180702,180701,
         180802,180801,180601,230108,230107,230106,230105,230104,
    230103,230102,230101,230110,230212,230211,230210,230209,230209,230208,230207,230206,230205,230204,230203,230202,230201,230301,190107,190106,190105,190104,190103,190102,190101,190204,190203,190202,190201,190313,190312,190311,190310,190309,
    190308,190307,190306,190305,190304,190303,190302,190301,190411,190410,190409,190408,190407,190406,190405,190404,190403,190402,190401,190504,190503,190502,190501,190603,190602,190601,190707,190706,190705,190704,190703,190702,
    190701,190801,210803,210802,210801,210112,210113,210114,210115,210116,210117,210118,210119,210120,210121,210122,210101,210102,210103,210104,210105,210106,210107,210108,210109,210110,210111,210201,210202,210305,210304,210303,210302,
    210301,210404,210403,210402,210401,210504,210505,210506,210503,210502,210501,210701,250107,250106,250105,250104,
    250103,250102,250101,250204,250203,250202,250201,250301,240117,240116,240115,240114,240113,240112,240111,240110,240109,240108,240107,240106,240105,240104,240103,240102,240101,240206,240205,240204,240203,240202,240201,240304,240303,240302,240301,240402,240401,240501,220103,220102,220101,220212,
    220211,220210,220209,220208,220207,220206,220205,220204,220203,220202,220201,220303,220302,220301,220403,220402,220401,220505,220504,220503,220502,220501,220601,260110,260109,260108,260107,260106,260105,260104,260103,260102,260101,260203,260202,260201,260308,260307,260306,260305,260304,260303,
    260302,260301,260404,260404,260402,260401,260501,270103,270102,270101,270201,280105,280104,280103,280102,280101,280202,280201,280301,290105,290106,290104,290103,290102,290101,290211,290210,290206,290207,290208,290209,290201,
    290202,290203,290204,290205,290301,290305,290306,290307,290302,290303,290304,300310,300309,300308,300307,300306,300305,300304,300303,300302,300301,300406,300405,300404,300403,300402,300401,300510,300509,300508,300507,300506,300505,300504,300503,300502,300501,300606,300605,
300604,300603,300602,300601,300701,200101]

    # Fail fast on an unsupported setting. The original left the URL empty in
    # that case, which made every request fail and the page loop spin forever.
    period_boss = _PERIOD_BY_DAYS.get(aaa_boss)
    if period_boss is None:
        raise ValueError("unsupported aaa_boss value: " + str(aaa_boss))

    # The list above contains repeated codes; crawl each code only once so the
    # output does not receive the same listing twice.
    seen_codes = set()
    for hangye_boss_end in hangye_boss_all:
        if hangye_boss_end in seen_codes:
            continue
        seen_codes.add(hangye_boss_end)

        page_boss = 1
        failed_fetches = 0
        while True:
            url_boss = _build_list_url(hangye_boss_end, period_boss, page_boss)

            try:
                resp_boss = requests.get(url_boss, headers=headers, timeout=_REQUEST_TIMEOUT)
            except requests.RequestException:
                print("boss直聘爬取异常1")
                failed_fetches += 1
                if failed_fetches >= _MAX_LIST_FETCH_RETRIES:
                    # Give up on this industry code instead of retrying forever.
                    break
                time.sleep(1)
                continue
            failed_fetches = 0

            resp_boss.encoding = "utf-8"
            print(page_boss)
            bs_boss = BeautifulSoup(resp_boss.text, "lxml")

            # Industry name = page title with the "【全国" prefix removed, cut
            # just before "招聘".
            befor_split = bs_boss.find("title").get_text().replace("【全国", "")
            hangye_boss_name = befor_split[:befor_split.index("招聘")]

            # The highlighted pager link carries the page number actually served.
            try:
                url_page = bs_boss.find(_has_class("div", "page")).find(
                    _has_class("a", "cur")).get_text()
            except AttributeError:
                # No pager (or no current-page link): nothing more for this code.
                break

            # Served page differs from the requested one: stop paging this code.
            if int(url_page) != page_boss:
                break

            for f_boss in bs_boss.find_all(_has_class("div", "job-primary")):
                # --- job information ---
                job_boss = f_boss.find(_has_class("div", "info-primary"))
                job_boss_url = "https://www.zhipin.com" + job_boss.find("a")["href"]
                job_boss_name = job_boss.find(_has_class("div", "job-title")).get_text()
                job_boss_money = job_boss.find("span").get_text()
                # The first <p> holds three fields separated by
                # <em class="vline"></em>; turn the separators into commas and split.
                job_boss_aaa02 = (str(job_boss.find("p"))
                                  .replace("<p>", "")
                                  .replace("</p>", "")
                                  .replace('<em class="vline"></em>', ",")
                                  .split(","))
                job_boss_addr = job_boss_aaa02[0]
                job_boss_year = job_boss_aaa02[1]
                job_boss_xueli = job_boss_aaa02[2]

                # --- publish date ---
                job_boss_date = ""
                time_boss = f_boss.find(_has_class("div", "info-publis")).find("p")
                if time_boss is not None:
                    job_boss_date = _parse_publish_date(time_boss.get_text(), time.strftime("%m-%d"))

                # --- job description (from the job's detail page) ---
                try:
                    resp_boss_job = requests.get(job_boss_url, headers=headers, timeout=_REQUEST_TIMEOUT)
                except requests.RequestException:
                    print("boss直聘爬取异常2")
                    continue  # skip this job, keep going with the next card

                resp_boss_job.encoding = "utf-8"
                bs_boss_job = BeautifulSoup(resp_boss_job.text, "lxml")
                job_boss_desc001 = bs_boss_job.find(_has_class("div", "job-sec"))
                job_boss_desc003 = None
                if job_boss_desc001 is not None:
                    job_boss_desc003 = job_boss_desc001.find(_has_class("div", "text"))

                if job_boss_desc003 is not None:
                    job_boss_desc002 = job_boss_desc003.get_text()
                else:
                    # Placeholder written when the description cannot be located.
                    job_boss_desc002 = "自己打开网址看"
                # Newlines would break the one-record-per-line output format.
                job_boss_desc = job_boss_desc002.replace('\n', "")

                # --- company information ---
                company_boss = f_boss.find(_has_class("div", "company-text")).find("a")
                company_boss_name = company_boss.get_text()
                company_boss_url = "https://www.zhipin.com" + company_boss["href"]
                # The company page is not fetched; a fixed placeholder is
                # stored as the company description instead.
                company_boss_desc = "自己点开看吧,爬的太慢了"

                # Record layout: site ^ industry ^ company name ^ company url ^
                # company description ^ job name ^ job url ^ job description ^
                # salary ^ address ^ education ^ experience ^ headcount (always
                # "0") ^ publish date
                data_boss = "^".join([
                    "boss",
                    hangye_boss_name,
                    company_boss_name,
                    company_boss_url,
                    company_boss_desc,
                    job_boss_name,
                    job_boss_url,
                    job_boss_desc,
                    job_boss_money,
                    job_boss_addr,
                    job_boss_xueli,
                    job_boss_year,
                    "0",
                    job_boss_date,
                ]) + "\n"

                file.write(data_boss)
                print(data_boss)
                print(hangye_boss_end)
            page_boss += 1

except Exception:
    # Top-level boundary: report the failure and fall through to cleanup.
    traceback.print_exc()
    print("boss直聘爬取异常3")
finally:
    # Always release the output file, even on Ctrl-C.
    file.close()
print("boss直聘爬取完毕")






  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值