Fetching job postings with Python

Complete 51job scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
import numpy as np


# Fetch job posting info
def jobMesssage(item):
    df = pd.DataFrame()
    links = item.find_all('a', attrs={'class': 'el'})  # posting entries on this card
    for i, link in enumerate(links):
        try:
            df['招聘职位网址'] = link.get('href'),
            df['岗位名称'] = link.find_all('span')[0].text,
            df['发布日期'] = link.find_all('span')[1].text,
            df['薪资'] = link.find_all('span')[2].text,
            df['工作地及要求'] = link.find_all('span')[3].text,
            # df_all = pd.concat([df, df_all], axis=1)
            for tag in link.find_all('p', attrs={'class': 'tags'}):
                df['福利'] = tag.get('title'),  # benefits
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')
    return df
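
A minimal offline check for jobMesssage(), assuming a listings page saved locally as jobhtml.html (a hypothetical file; the commented-out open() calls elsewhere in this script hint at the same workflow):

def test_jobMesssage_offline(path='jobhtml.html'):
    # Parse a saved 51job listings page and run the extractor on each card.
    with open(path, 'r', encoding='utf-8') as f:
        page = BeautifulSoup(f, 'html.parser')
    for block in page.find_all('div', attrs={'class': 'e'}):
        print(jobMesssage(block))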


# Fetch the company attached to a posting
def jobFirm(item):
    df = pd.DataFrame()
    firms = item.find_all('div', attrs={'class': 'er'})  # company blocks
    for i, firm in enumerate(firms):
        try:
            df['招聘公司网址'] = firm.find('a').get('href'),
            df['公司名称'] = firm.find('a').text,
            df['公司规模'] = firm.find_all('p')[0].text,
            df['所属行业'] = firm.find_all('p')[1].text,  # second <p>; the original read [0] twice
            print(str(i), '招聘公司写入正常')
        except:
            print(str(i), '招聘公司写入异常')
    return df


# Job requirements
def jobRequire(html):
    df = pd.DataFrame()
    blocks = html.find_all('div', attrs={'class': 'tCompany_main'})
    for i, item in enumerate(blocks):
        try:
            # Fallback: the whole description box, whitespace removed
            jobRequir_a = item.find('div', attrs={'class': 'tBorderTop_box'}).text \
                .strip().replace('\n', '').replace('\t', '').replace(' ', '')
            paras = item.find('div', attrs={'class': 'tBorderTop_box'}).find_all('p')
            jobRequir = []  # requirement paragraphs
            for p in paras:
                jobRequir.append(p.text.strip() + '\n')
            jobRequirText = ''.join(jobRequir)
            # Prefer the paragraph text when it carries the '任职要求' marker
            if jobRequirText.find('任职要求') > 0:
                df['招聘要求'] = jobRequirText,
            else:
                df['招聘要求'] = jobRequir_a,
            print(str(i), '职位信息写入正常')
        except:
            print(str(i), '职位信息写入异常')
    return df
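
The marker check above prefers the joined paragraph text whenever it contains '任职要求'; the same decision as one self-contained helper (a sketch):

def extract_requirements(box):
    # box: a 'tBorderTop_box' div; returns the requirements text.
    paras = '\n'.join(p.get_text(strip=True) for p in box.find_all('p'))
    return paras if '任职要求' in paras else box.get_text(strip=True)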


# Fetch the hiring company's profile
def firmMeessage(html):
    df = pd.DataFrame()
    blocks = html.find_all('div', attrs={'class': 'tCompany_full'})
    for i, item in enumerate(blocks):
        boxes = item.find_all('div', attrs={'class': 'tBorderTop_box'})
        try:
            df['公司信息'] = boxes[0].text.strip().replace('\n', '').replace('\t', '').replace(' ', ''),
            print(str(i), '公司信息写入正常')
        except:
            print(str(i), '公司信息写入异常')
    return df
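
The bare except: blocks in these parsers hide the real failure reason and even swallow KeyboardInterrupt; a narrower wrapper (a sketch with a hypothetical safe_parse helper) keeps a loop resilient while still reporting what broke:

def safe_parse(fn, arg, label):
    # Run a parser, report the exception, and return an empty frame on failure.
    try:
        return fn(arg)
    except Exception as e:
        print(label, 'failed:', e)
        return pd.DataFrame()
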
class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        new_worksheet.range('l:l').row_height = 20
        new_worksheet.range('l:l').column_width = 11
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期', '是否投递']
        new_worksheet['A1'].value = title
        for i in range(len(self.data)):
            new_worksheet.cells[i + 1, 0].value = i + 1
            new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
            new_worksheet.cells[i + 1, 2].value = self.data[i]['发布日期']
            new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
            new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
            new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
            new_worksheet.cells[i + 1, 6].value = self.data[i]['公司规模']
            new_worksheet.cells[i + 1, 7].value = self.data[i]['所属行业']
            new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
            # new_worksheet.cells[i + 1, 9].value = self.data[i]['招聘要求']
            new_worksheet.cells[i + 1, 10].value = self.data[i]['招聘公司网址']
            # new_worksheet.cells[i + 1, 11].value = self.data[i]['公司信息']
            new_worksheet.cells[i + 1, 12].value = self.data[i]['福利']
            # Per-run fields
            new_worksheet.cells[i + 1, 13].value = self.data[i]["关键字"]  # keyword
            new_worksheet.cells[i + 1, 14].value = '15-40K' if salary == '08%252c09%252c10' else '20-30K'  # salary range
            new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

            print(str(i), 'Excel数据写入正常')
        new_worksheet.autofit()
        new_workbook.save('jobGain.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # Pass the bound method itself; calling it (target=self.wE_r()) would run
        # wE_r in the parent process and hand Process a None target.
        pf = multiprocessing.Process(target=self.wE_r)
        pf.start()
        pf.join()


# Write results to Excel separately
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    # Open the target worksheet
    sh = wb.sheets['前程无忧']
    rng_firmMeessage = [i for i in sh.range("k:k").value if i != None]  # company-URL column
    rng_jobRequire = [i for i in sh.range("i:i").value if i != None]  # job-URL column
    j = sh.range('a1').expand('table').rows.count  # row count
    app.display_alerts = False
    # app.screen_updating = False
    myWeb = Web(job_url)  # instantiate the browser helper
    for i in range(len(rng_jobRequire) - 1):
        try:
            html = myWeb.web_b(rng_firmMeessage[i + 1])  # fetch company info
            time.sleep(0.2)
            print(rng_firmMeessage[i + 1])
            df5 = firmMeessage(html)
            print(df5)
            sh.cells[i + 1, 11].value = df5.iloc[0, 0]
            print(str(i), '公司信息写入正常')
        except:
            print(str(i), "公司信息写入错误")
        # Fetch the requirements text
        try:
            html = myWeb.web_a(rng_jobRequire[i + 1])  # fetch the job detail page
            print(rng_jobRequire[i + 1])
            df4 = jobRequire(html)
            print(df4)
            sh.cells[i + 1, 9].value = df4.iloc[0, 0]
            print(str(i), '招聘要求写入正常')
            # Click "apply"
            ''''''
            time.sleep(0.5)
            driver.find_element_by_xpath(
                ".//div[contains(@class,'op')]/a[@class='but_sq']").click()
            print(str(i), '已投递')
            time.sleep(1.5)
            sh.cells[i + 1, 18].value = datetime.date.today()
            print(str(i), '已沟通')
            ''''''
        except:
            print(str(i), "数据查询错误")
            print(str(i), '投递失败')
            sh.cells[i + 1, 18].value = '沟通失败'

    # sh.autofit()
    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()

class Web:
    def __init__(self, url):
        self.url = url

    def web(self):
        # with open('jobhtml.html', 'r', encoding='utf-8') as f:
        # job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'
        driver.back()
        time.sleep(0.5)
        driver.get(self.url)  # 加载网址
        # driver.set_page_load_timeout(5)#超时停止加载
        time.sleep(1)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'j_joblist'})
        return html.list

    # 招聘需求信息获取
    def web_a(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)  # 加载网址
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        time.sleep(1)
        # 隔8次刷新一次
        # if i % 8 == 7: time.sleep(3), print('刷新{}次'.format(i))  # driver.refresh(),
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html

    # 招聘公司信息获取
    def web_b(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)  # 加载网址
        time.sleep(0.8)
        driver.get(url)  # 加载网址
        time.sleep(1)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        # print(html)
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html
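
The fixed time.sleep() calls in Web can be replaced by an explicit wait on the listings container; a sketch using Selenium's WebDriverWait (the 'j_joblist' class is the one web() searches for):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_listings(driver, timeout=10):
    # Block until the job-list container appears, up to `timeout` seconds.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist')))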


'''
# hrbp
key = '运营经理'  # 物流经理#物流运营#物流管理#物流总监#运营经理【#物流#运营#数据#运输#仓储#配送】
salary = '08%252c09%252c10'  # 08表示1.5-20K,09表示20-30k,08%252c09%252c10表示1.5-20K,20-30K,30K以上
timeday = '3'  # 1表示近三天,2表示近一周,近一个月是3
'''
''''''
keys1 = ['物流','运输','仓储','配送']
keys2=['物流经理','物流总监','物流运营']
keys3 = ['产品','数据','运营','供应链']
keys4=['人工智能','大数据']
keys_test=['产品']
# #'运营'#【物流#运输#仓储|产品#数据#运营|物流经理#物流总监#物流运营#】
# 物流经理#物流运营#物流管理#物流总监#供应链#运营经理【#产品#运营#数据#资产|#物流#运输#仓储#配送#运力#】
key = keys2[2]
print('关键字:', key)
job_url_key = 2  # industry filter: 1 = with industry, 2 = without

salary = '08%252c09%252c10'  # 08 = 1.5-20K, 09 = 20-30K; 08%252c09%252c10 = 1.5-20K, 20-30K and 30K+
timeday = '3'  # 1 = last 3 days, 2 = last week, 3 = last month
''''''
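
The salary and timeday values above are 51job's URL filter codes; keeping their meanings in a small mapping makes later steps self-documenting (a sketch; labels copied from the comments above):

SALARY_LABELS_51JOB = {
    '08': '1.5-20K',
    '09': '20-30K',
    '08%252c09%252c10': '1.5-20K / 20-30K / 30K+',
}
TIMEDAY_LABELS_51JOB = {'1': 'last 3 days', '2': 'last week', '3': 'last month'}
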
if __name__ == "__main__":
    ''''''
    # Firefox
    opt = FirefoxOptions()  # browser options
    # Skip image loading
    # opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # show the browser window
    driver = webdriver.Firefox(options=opt)  # instantiate the browser
    # Maximize the current window
    driver.maximize_window()
    # driver.set_window_size(500, 900)
    ''''''
    '''
    #谷歌
    opt = ChromeOptions()  # 创建chrome参数
    # 不加载图片
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # opt.add_experimental_option("prefs", prefs)
    # 超时不加载
    opt.page_load_strategy = 'eager'
    opt.headless = False  # 显示浏览器
    driver = Chrome(options=opt)  # 浏览器实例化
    '''
    # job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'  # search page
    # QR-code login
    job_url = 'https://login.51job.com/login.php?loginway=0&isjump=0&lang=c&from_domain=i&url='
    # job_url ='https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?
    # Stop loading on timeout
    try:
        # driver.set_page_load_timeout(8)
        driver.get(job_url)  # 加载网址
        time.sleep(1)
    except:
        driver.execute_script("window.stop()")
    try:
        # driver.set_page_load_timeout(8)
        name = 'zhangsen5182006@126.com'
        word = ''
        driver.find_element_by_xpath("//div[@class='txt']/input[@id='loginname']").send_keys(
            name)  # enter the username
        driver.find_element_by_xpath("//div[@class='txt']/input[@id='password']").send_keys(
            word)  # enter the password
        time.sleep(3)
        # Tick the agreement checkbox
        # Bulk selection
        checkboxes = driver.find_elements_by_xpath("//div[@class='lr_ok']/em[@id='isread_em']")
        count = 0
        if checkboxes:  # any matches found?
            for checkbox in checkboxes:  # click each match
                checkbox.click()  # tick the checkbox
                count += 1
                print("打印信息 ", count)
                time.sleep(1)
        else:
            print("没有找到元素")
        # Single selection
        # driver.find_element_by_id("isread_em").click()
        # Print the checkbox's checked state
        print(driver.find_element_by_id("isread_em").is_selected())
        time.sleep(3)
        # Click the login button
        driver.find_element_by_xpath(
            "//div[@class='btnbox']/button[@data-sensor-id='sensor_login_signinButton']").click()
        print('登录成功')
    except:
        print('登录失败')
        driver.execute_script("window.stop()")
    # # 不加载图片
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # opt.add_experimental_option("prefs", prefs)
    # #应对返回back
    # job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'#带行业
    # driver.get(job_url)  # 加载网址
    # 杭州,2-3万'https://search.51job.com/list/080200,000000,0000,00,9,09,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # 杭州1.5-2'https://search.51job.com/list/080200,000000,0000,00,9,08,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # # 招聘需求信息获取
    # myWeb = Web(job_url)  # 实例化类
    # time.sleep(0.2)
    # html = myWeb.web_a('https://jobs.51job.com/hangzhou-scq/125683481.html?s=sou_sou_soulb&t=0_0')  # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址
    # # df4 = jobRequire(html)  # 获取职位需求信息
    # df4 = jobRequire()
    # print(df4)
    # time.sleep(0.3)
    '''
    # 取前三页数据
    df = pd.DataFrame()  # 定义pands整理表格
    #加入数组
    keys = keys3 + keys4  # keys_test#keys3.extend(keys2)#确定数组序列keys=np.vstack(keys4)
    print('关键字:',keys)
    job_url_key = 1  # 选择是否带行业1带行业2不带行业
    df_key = pd.DataFrame()
    for j in range(len(keys)):
        key=keys[j]
        for i in range(6):
            try:  # '+str(i+1)+'#08表示1.5-20K,09表示20-30k
                print(str(i), '获取第{}页数据'.format(i + 1))
                #带行业去掉学历
                job_url1 = 'https://search.51job.com/list/080200,000000,0000,21,' + timeday + ',' + salary + ',' + key + ',2,' + str(
                    i + 1) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare='#03%252c04&大专和本科学历#不带行业00|21【带行业】
                #不带行业去掉学历
                job_url2 = 'https://search.51job.com/list/080200,000000,0000,00,' + timeday + ',' + salary + ',' + key + ',2,' + str(
                    i + 1) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare='  # 03%252c04&大专和本科学历#不带行业00|21【不带行业】
                # 选择是否带行业
                if job_url_key == 1:
                    job_url=job_url1
                else:
                    job_url=job_url2

                print(job_url)
                'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加学历degreefrom
                'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加薪资
                'https://search.51job.com/list/080200,000000,0000,21,3,09,%25E8%25BF%2590%25E8%2590%25A5,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                # 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                # 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                # 'https://search.51job.com/list/080200,000000,0000,00,3,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
                # with open('jobhtml.html', 'r', encoding='utf-8') as f:
                #     html = BeautifulSoup(f, 'html.parser')
                #     html.list = html.find_all('div', attrs={'class': 'j_joblist'})
                time1 = time.time()  # 计算时长

                myWeb = Web(
                    job_url)  # 实例化类  # 'https://jobs.51job.com/hangzhou-yhq/135494019.html?s=sou_sou_soulb&t=0_0')  # 实例化网址
                time.sleep(1)
                html = myWeb.web()
                # print(html)
                for i, item in enumerate(html):
                    # print(item,i,sep=',')
                    item.list = item.find_all('div', attrs={'class': 'e'})  # 获取每个招聘岗位条目
                    for i, item in enumerate(item.list):
                        df1 = jobMesssage(item)  # 获取岗位
                        # print(df1['招聘职位网址'])
                        df2 = jobFirm(item)  # 获取公司
                        url = str(df1['招聘职位网址'].values).strip("['").strip("']").strip('')
                        print(url)
                        url_b = str(df2['招聘公司网址'].values).strip("['").strip("']").strip('')
                        print(url_b)

                        ''''''
                        #暂时不加载
                        # 招聘需求信息获取
                        myWeb = Web(job_url)  # 实例化类
                        time.sleep(0.3)
                        html = myWeb.web_a(
                            url)  # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址
                        df4 = jobRequire(html)  # 获取职位需求信息
                        print(df4)
                        time.sleep(0.5 + 0.5 + 0.5)
    
                        # 招聘公司信息获取
                        myWeb = Web(job_url)  # 实例化类
                        time.sleep(0.3)
                        html = myWeb.web_b(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                        df5 = firmMeessage(html)  # 获取职位需求信息
                        print(df5)
                        time.sleep(0.5 + 0.5 + 0.5)
                        
                        df3 = pd.concat([df1, df2], axis=1)
                        df6 = pd.concat([df3, df4], axis=1)
                        df7 = pd.concat([df5, df6], axis=1)
                        df7.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
                        df = pd.concat([df, df7], axis=0)
                        ''''''
                        # 加入关键字
                        df_key["关键字"] = key,
                        # df2=df_key["关键字"]

                        df3 = pd.concat([df1, df2], axis=1)
                        df4=pd.concat([df3, df_key], axis=1)
                        df = pd.concat([df, df4], axis=0)
                        print(df)
                        df.to_json('jobGain.json', orient='records', indent=1, force_ascii=False)
                        time.sleep(0.1)
                time.sleep(0.1)
                print(str(i), '数据正常'.format(i + 1))

                time2 = time.time()  # 计算时长
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), '数据异常'.format(i + 1))


    ''''''
    ''''''
    # key = '物流管理'  # 物流经理#物流运营
    # salary = '08'  # 08表示1.5-20K,09表示20-30k
    with open('jobGain.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # 执行多线程
    '''

    ''''''
    #单独写入excel+批量沟通
    write_only()
    ''''''

    try:  # shut down the background browser
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM geckodriver.exe')  # kill the driver process (this script launches Firefox)
        sreach_windows = driver.current_window_handle
        # Handles of all open windows
        all_handles = driver.window_handles
        for handle in all_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
    except:
        print('后台浏览器已关闭')
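
A simpler teardown than the taskkill sequence above (a sketch): close every window first, then quit inside finally, so no dead-session handle lookups are needed.

def shutdown(driver):
    # Close all open windows, then end the WebDriver session.
    try:
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
    finally:
        driver.quit()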







Complete Liepin scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
import numpy as np

# Fetch job posting info
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    # with open('jobhtml.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'left-list-box'})
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-detail-box'})
        for i, item in enumerate(item.list):
            # print(item, i, sep=',')
            # print(item.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={'class': 'job-salary'}).text,i,sep=',')
            try:
                df_jobMesssage['招聘职位网址'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).get('href'),
                df_jobMesssage['岗位名称'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div', attrs={
                    'class': 'job-title-box'}).text.strip('').replace('\n', '').replace('\t', ''),
                df_jobMesssage['工作地及要求'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div',
                                                                                                           attrs={
                                                                                                              'class': 'job-labels-box'}).text.strip(
                    '').replace('\n', '').replace('\t', ''),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'data-nick': 'job-detail-company-info'}).find('div',
                                                                                                               attrs={
                                                                                                                   'class': 'job-company-info-box'}).text.strip(
                    '').replace('\n', '').replace('\t', '')
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={
                    'class': 'job-salary'}).text

                # print(df_jobMesssage)
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                # df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)

                print(str(i), '招聘职位写入正常')
            except:
                print(str(i), '招聘职位写入异常')

    return df
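
Appending a one-row frame per card via concat is slow; collecting plain dicts and building a single DataFrame at the end is the more idiomatic pattern (a sketch reusing the selectors from jobMesssage() above, trimmed to two fields):

def parse_liepin_cards(html):
    rows = []
    for card in html.find_all('div', attrs={'class': 'job-detail-box'}):
        link = card.find('a', attrs={'data-nick': 'job-detail-job-info'})
        if link is None:
            continue
        title = link.find('div', attrs={'class': 'job-title-box'})
        rows.append({
            '招聘职位网址': link.get('href'),
            '岗位名称': title.get_text(strip=True) if title else '',
        })
    return pd.DataFrame(rows)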


# Fetch job requirements and company info
def jobRequire(url):
    df = {}  # plain dict (not a DataFrame)
    # cookie-based requests login (kept for reference, disabled)
    '''
    # url='https://www.liepin.com/a/29686195.shtml?d_sfrom=search_prime&d_ckId=c8f01cee484fdfafc8e1e5d047a1e1d1&d_curPage=0&d_pageSize=40&d_headId=6ae8e76ae415c8d307347eef4182b4e4&d_posi=38'
    cookie = 'Cookie: __gc_id=70a4e01358d04ed79ef74f4fc23c222c; __s_bid=8a2d91b9f536cac458eb326f0b49215d68c7; __uuid=1650380251322.69; need_bind_tel=false; new_user=false; c_flag=0c4573f8bb0cb2b812aa25ce8c3b2f86; imClientId=fdbaa46cb37362142adfca85e0d0ca5a; imId=fdbaa46cb37362143236da4ede1d7827; imClientId_0=fdbaa46cb37362142adfca85e0d0ca5a; imId_0=fdbaa46cb37362143236da4ede1d7827; Hm_lvt_a2647413544f5a04f00da7eee0d5e200=1653529592,1653532082,1653565242; __tlog=1653565242345.73%7C00000000%7C00000000%7C00000000%7C00000000; acw_tc=276082a116535652427123225e588c83c08226ad9f70474ad06a4088eedcb7; __session_seq=8; __uv_seq=36; Hm_lpvt_a2647413544f5a04f00da7eee0d5e200=1653566696'
    headers = {
        'user-agent': 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Cookie': cookie,
        'Connection': 'keep - alive',
    }
    # 新闻链接
    # session = requests.session()
    res = requests.get(url=url, headers=headers, timeout=30)
    res.encoding = 'utf-8'
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    html = BeautifulSoup(res.text, 'html.parser')
    time.sleep(0.5)
    # print(html)
    # 存入本地
    with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        f.write(res.text)
    # with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    '''
    # Load the page through the browser
    ''''''
    myWeb = Web(url)  # instantiate the helper
    html = myWeb.web_b(url)  # requirements and company info
    ''''''
    blocks = html.find_all('content')  # page skeleton
    for i, item in enumerate(blocks):  # iterate the matched blocks, not the raw soup
        # item.list = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text#上级框架
        # print(item.list)
        try:
            df['招聘要求'] = item.find_all('section', attrs={'class': 'job-intro-container'})[0].text.strip('\n'),
            df['公司信息'] = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text.strip('\n'),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
            print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df
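
The same two sections can be pulled out with CSS selectors, which reads tighter than the chained find_all calls above (a sketch; the class names mirror those used in jobRequire()):

def parse_liepin_detail(html):
    # Extract requirements and company profile from a Liepin detail page.
    intro = html.select_one('section.job-intro-container')
    company = html.select_one('section.company-intro-container')
    return {
        '招聘要求': intro.get_text(strip=True) if intro else '',
        '公司信息': company.get_text(strip=True) if company else '',
    }
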
# Write results to Excel separately
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    # Open the target worksheet
    sh = wb.sheets['猎聘']
    rng_jobRequire = [i for i in sh.range("i:i").value if i != None]  # job-URL column
    j = sh.range('a1').expand('table').rows.count  # 序号
    app.display_alerts = False
    # app.screen_updating = False
    myWeb = Web(url)  # 实例化类
    for i in range(len(rng_jobRequire) - 1):
        try:
            # Write requirements and company info
            ''''''
            df_w = jobRequire(rng_jobRequire[i + 1])
            time.sleep(0.5)
            try:
                print(rng_jobRequire[i + 1])
                sh.cells[i + 1, 9].value = df_w['招聘要求']  # str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
                sh.cells[i + 1, 11].value =df_w['公司信息']  # str(df_w['公司信息'].values).strip("['").strip("']").strip('')
            except:
                print(str(i), "数据查询错误")
            ''''''
            # html = myWeb.web_a(rng_jobRequire[i + 1])  # 获取招聘要求信息
            # print(rng_jobRequire[i + 1])
            # 点击沟通
            '''
            time.sleep(0.5)
            driver.find_element_by_xpath(
                ".//div[contains(@class,'apply-box')]/a[contains(@class,'btn-main')]").click()#"//div[@class='txt']/input[@id='password']"
            # print(str(i), '招聘要求写入正常')
            time.sleep(1)
            sh.cells[i + 1, 18].value = datetime.date.today()
            print(str(i), '已沟通')
            '''
        except:
            print(str(i), '沟通失败')
            sh.cells[i + 1, 18].value ='沟通失败'

    # sh.autofit()
    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()

class Web:
    def __init__(self, url):
        self.url = url

    # 获取招聘职位信息
    def web(self):
        driver.back()
        time.sleep(0.5)
        driver.get(self.url)  # 加载网址
        time.sleep(1)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'left-list-box'})
        # with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list

    # 获取招聘要求和公司信息
    def web_a(self, url):
        driver.back()
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        time.sleep(1)
        # 隔8次刷新一次
        # if i%8 == 7: time.sleep(3), print('刷新{}次'.format(i))  # driver.refresh(),
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('content')  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list
        # 获取招聘要求和公司信息

    def web_b(self, url):#备用获取网页主体
        driver.back()
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        time.sleep(1)
        # 隔8次刷新一次
        # if i%8 == 7: time.sleep(3), print('刷新{}次'.format(i))  # driver.refresh(),
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        # html.list = html.find_all('content')  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html


class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title

        for i in range(len(self.data)):
            try:
                # Optionally fetch the detail page per row:
                # df_w = jobRequire(self.data[i]['招聘职位网址'])

                # Anti-scraping pause: rest after every 8 rows
                # if i % 9 == 8:
                #     time.sleep(20)
                # else:
                #     time.sleep(0.2)
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # self.data[i]['发布日期']
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # self.data[i]['公司规模']
                new_worksheet.cells[i + 1, 7].value = ''  # self.data[i]['所属行业']
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 9].value = ''  # df_w['招聘要求']
                new_worksheet.cells[i + 1, 10].value = ''  # self.data[i]['招聘公司网址']
                new_worksheet.cells[i + 1, 11].value = ''  # df_w['公司信息']
                new_worksheet.cells[i + 1, 12].value = ''  # self.data[i]['福利']
                # Per-run fields
                new_worksheet.cells[i + 1, 13].value = self.data[i]["关键字"]  # keyword
                new_worksheet.cells[i + 1, 14].value = salary  # salary range
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date
                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')

        # new_worksheet.autofit()
        new_workbook.save('jobliepin.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, don't call it
        pf.start()
        pf.join()


df = pd.DataFrame()  # global accumulator
keys1 = ['物流','运输','仓储','配送']
keys2=['物流经理','物流总监','物流运营']
keys3 = ['产品','数据','运营','供应链']
keys4=['人工智能','大数据']
keys_test=['物流']
# #'运营'#【物流#运输#仓储|产品#数据#运营|物流经理#物流总监#物流运营#】
# 物流经理#物流运营#物流管理#物流总监#供应链#运营经理【#产品#运营#数据#资产|#物流#运输#仓储#配送#运力#】
key = keys2[2]
print('关键字:', key)
job_url_key = 2  # industry filter: 1 = with industry, 2 = without

salary = '15$30'  # other bands: '10$20', '20$40'
timeday = '7'  # 7 = last week, 3 = last 3 days, 30 = last month
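
Building the Liepin search URL by string concatenation, as the commented-out loop below does, is fragile; urllib can assemble the query string instead (a sketch; dq=070020 and industry='9$250' are the hard-coded values from that loop, and the headId/ckId tracking parameters are left out):

from urllib.parse import urlencode

def liepin_search_url(key, salary, pub_time, page, industry=None):
    # Assemble a Liepin job-search URL from the filter values above.
    params = {'key': key, 'dq': '070020', 'salary': salary,
              'pubTime': pub_time, 'currentPage': page}
    if industry:
        params['industry'] = industry  # e.g. '9$250' for logistics
    return 'https://www.liepin.com/zhaopin/?' + urlencode(params)
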
if __name__ == "__main__":
    ''''''
    # Chrome
    # jobRequire()
    opt = ChromeOptions()  # browser options
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # instantiate the browser
    # Maximize the current window
    driver.maximize_window()
    # Mask navigator.webdriver to dodge bot detection
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument", {
            "source": """
           Object.defineProperty(navigator, 'webdriver', {
             get: () => undefined
           })
           """
        })
    # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    ''''''
    '''
    # 火狐
    opt = FirefoxOptions()  # ChromeOptions()  # 创建chrome参数
    # 不加载图片
    # opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # 显示浏览器
    driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化
    # driver.set_window_size(500, 900)
    # options = FirefoxOptions()
    # selenium = webdriver.Firefox(options=options)
    '''
    ''''''
    # QR-code login
    # url = 'https://www.baidu.com/'  # warm-up URL
    # driver.get(url)
    # driver.back()
    url = 'https://c.liepin.com/?time=1653531201141'  # 'https://www.liepin.com/'
    driver.get(url)  # load the URL
    time.sleep(1)
    ''''''
    ''''''
    # driver.find_element_by_xpath("//div[@class='jsx-2755331170 top-switch-icon']").click()  # switch to desktop login
    # driver.find_element_by_id('login').click()  # simulated click
    driver.find_element_by_xpath(
        "//div[@class='jsx-1730583093 ']").click()  # click "account login"
    name = 'zhangsen5182006@126.com'
    word = ''
    driver.find_element_by_xpath("//div[@class='ant-form-item-control-input-content']/span[@class='ant-input-affix-wrapper ant-input-affix-wrapper-lg']/input[@id='login']").send_keys(
        name)  # enter the username
    driver.find_element_by_xpath("//span[@class='ant-input-affix-wrapper ant-input-affix-wrapper-lg']/input[@id='pwd']").send_keys(
        word)  # enter the password
    time.sleep(3)
    # Tick the agreement checkbox
    # Bulk selection
    checkboxes = driver.find_elements_by_xpath("//span[@class='ant-checkbox']/input[@class='ant-checkbox-input']")
    count = 0
    if checkboxes:  # any matches found?
        for checkbox in checkboxes:  # click each match
            checkbox.click()  # tick the checkbox
            count += 1
            print("打印信息 ", count)
            time.sleep(1)
    else:
        print("没有找到元素")
    # Single selection
    # driver.find_element_by_class_name("ant-checkbox-input").click()
    # Print the checkbox's checked state
    print(driver.find_element_by_class_name("ant-checkbox-input").is_selected())
    time.sleep(3)
    # Click the login button
    driver.find_element_by_xpath(
        "//form[@class='ant-form ant-form-horizontal ant-form-large']/button[@class='ant-btn ant-btn-primary ant-btn-round ant-btn-lg login-submit-btn']").click()
    print('登录成功')
    ''''''
    '''
    # 加入数组
    keys=keys3+keys4#keys_test#keys3.extend(keys2)#确定数组序列keys=np.vstack(keys4)
    print('关键字:', keys)
    job_url_key = 1  # 选择是否带行业1带行业2不带行业
    df_key = pd.DataFrame()
    for j in range(len(keys)):
        key = keys[j]
        for i in range(6):  # +str(i);key=
            try:
                print(str(i), '获取第{}页数据'.format(i + 1))
                # 物流行业
                job_url1 = 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=' + str(
                    key) + '&industry=9$250&dq=070020&salary=' + salary + '&pubTime=' + timeday + '&currentPage=' + str(i)
                # #未分配行业
                job_url2 = 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=' + str(
                    key) + '&dq=070020&salary=' + salary + '&pubTime=' + timeday + '&currentPage=' + str(i)
                # 选择是否带行业
                if job_url_key == 1:
                    job_url = job_url1
                else:
                    job_url = job_url2

                print(job_url)
                # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=l07j4hoqgyh0gdr1cskm5ur0c8umz86a&oldCkId=84c318d34f244edd65090ab5353419c3&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=9$250&dq=070020&salary=15$30&pubTime=7&customSalary=1&currentPage=1&scene=page'
                # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=6aa2zbc9ptmwb1w7909zc2vm047p6uib&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=&dq=070020&salary=15$30&pubTime=7&customSalary=1&scene=condition'
                # 'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&ckId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=20$40&pubTime=3&currentPage=1'
                # 'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=10$20&pubTime=3'
                # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=%E7%89%A9%E6%B5%81%E7%AE%A1%E7%90%86&dq=070020&salary=20$40&pubTime=3'
                # job_url_a='https://www.liepin.com/a/30216633.shtml?d_sfrom=search_prime&d_ckId=10e193c94fdc8095c14815c02246e6e7&d_curPage=0&d_pageSize=40&d_headId=6ae8e76ae415c8d307347eef4182b4e4&d_posi=2'
                time1 = time.time()  # 计算时长
                # 获取招聘职位信息
                myWeb = Web(job_url)  # 实例化类
                html = myWeb.web()  # 招聘要求和公司信息
                time.sleep(1)
                # print(html)
                df1 = jobMesssage(html)
                #加入关键字
                df_key["关键字"]=key,
                # df2=df_key["关键字"]
                df3 = pd.concat([df1, df_key], axis=1)
                df = pd.concat([df, df3], axis=0)
                # df = pd.concat([df1, df], axis=0)
                df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
                time2 = time.time()  # 计算时长
                print(str(i), '数据正常'.format(i + 1))
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), '数据异常'.format(i + 1))
    
    ''''''
    ''''''
    # 写入excel
    with open('jobliepin.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # 执行多线程
    '''

    ''''''
    # 单独写入excel+批量沟通
    write_only()
    ''''''

    try:  # 关闭后台浏览器
        driver.close()
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # 关闭进程浏览器
        sreach_windows = driver.current_window_handle
        # 获得当前所有打开的窗口的句柄
        all_handles = driver.window_handles
        for handle in all_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
    except:
        print('后台浏览器已关闭')



Complete BOSS Zhipin scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# Fetch job posting info (BOSS listings page)
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    # with open('jobhtml.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-list'})
    # print(html.list)
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        # print(item,i,sep=',')
        for i, item in enumerate(item.list):  # 获取每个招聘条目
            # print(item, i, sep=',')
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                # print(df_jobMesssage)
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), '公司信息写入正常')
            except:
                print(str(i), '公司信息写入异常')
    return df
# Fetch job requirements and company info (BOSS detail page)
def jobRequire(html):
    # df = pd.DataFrame()
    df = {}  # plain dict (not a DataFrame)
    # # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
    # url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
    # # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
    # cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
    # # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
    # # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
    # headers = {
    #     'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    #     'Cookie': cookie,
    #     'Connection': 'keep - alive',
    #     'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
    # }
    # # 新闻链接
    # # session = requests.session()
    # res = requests.get(url=url, headers=headers, timeout=30)
    # res.encoding = 'utf-8'
    # res.raise_for_status()
    # res.encoding = res.apparent_encoding
    # html = BeautifulSoup(res.text, 'html.parser')
    # time.sleep(3)
    # print(html)
    # # 存入本地
    # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
    #     f.write(res.text)
    # with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
    for i, item in enumerate(html):
        # print(item,1,sep=',')
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # 上级框架,
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
                                                                                                                   ''),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df
class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期','是否投递']
        new_worksheet['A1'].value = title
        new_worksheet.range('l:l').row_height=20
        new_worksheet.range('l:l').column_width=11

        for i in range(len(self.data)):
            try:
                # df_w = jobRequire(self.data[i]['招聘职位网址'])
                # print(self.data[i]['招聘职位网址'])
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # self.data[i]['发布日期']
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # self.data[i]['公司规模']
                new_worksheet.cells[i + 1, 7].value = ''  # self.data[i]['所属行业']
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                # new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 10].value = ''  # self.data[i]['招聘公司网址']
                # new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                new_worksheet.cells[i + 1, 12].value = ''  # self.data[i]['福利']
                # Per-run fields
                new_worksheet.cells[i + 1, 13].value = self.data[i]["关键字"]  # keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary range
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')
        '''
        # 招聘公司信息获取
        for i in range(len(self.data)):
            try:
                # 招聘公司信息获取
                time1 = time.time()  # 计算时长
                myWeb = Web(url)  # 实例化类
                time.sleep(0.5+1)
                html = myWeb.web_a(data[i]['招聘职位网址'],i)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                df_w = jobRequire(html)  # 获取职位需求信息
                print(df_w)
                time.sleep(2)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel数据-2模块写入正常')
                time2 = time.time()  # 计算时长
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), 'Excel数据-2模块写入异常')
        '''
        # new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, don't call it
        pf.start()
        pf.join()

# Write results to Excel separately
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    # Open the target worksheet
    sh = wb.sheets['Boss']
    rng_jobRequire = [i for i in sh.range("i:i").value if i != None]  # job-URL column
    j = sh.range('a1').expand('table').rows.count  # 序号
    app.display_alerts = False
    # app.screen_updating = False
    myWeb = Web(url)  # 实例化类
    for i in range(len(rng_jobRequire) - 1):
        try:
            #写入招聘要求和公司信息
            ''''''
            try:
                html = myWeb.web_a(rng_jobRequire[i + 1],i) # 获取招聘要求信息
                print(rng_jobRequire[i + 1])
                df_w = jobRequire(html)
                time.sleep(1.5)
                print(df_w)
                # print(df4.index)
                # print(df4.iloc[0,0])
                sh.cells[i + 1, 9].value = df_w['招聘要求']
                sh.cells[i + 1, 11].value = df_w['公司信息']
                time.sleep(1.5)
            except:
                print(str(i), "数据查询错误")
            ''''''
            # 点击沟通
            '''
            driver.find_element_by_xpath(
                ".//div[contains(@class,'job-op')]/div[contains(@class,'btn-container')]/a[contains(@class,'btn btn-startchat')]").click()
            # print(str(i), '招聘要求写入正常')
            time.sleep(1.5)
            sh.cells[i + 1, 18].value = datetime.date.today()
            print(str(i), '已沟通')
            '''
        except:
            print(str(i), "数据查询错误")
            print(str(i), '沟通失败')
            sh.cells[i + 1, 18].value ='沟通失败'

    # sh.autofit()
    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()

class Web:

    def __init__(self, url):
        self.url = url
    # 获取招聘职位信息
    def web(self):
        driver.get('https://www.baidu.com/')  # 加载网址
        driver.refresh()
        driver.get('https://www.baidu.com/')  # 加载网址
        driver.back()
        driver.refresh()
        time.sleep(0.5)
        driver.get(self.url)  # 加载网址
        time.sleep(1)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        # with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list

    # 获取招聘要求和公司信息
    def web_a(self, url,i):

        driver.get('https://www.baidu.com/')  # 加载网址
        driver.refresh()
        driver.get('https://www.baidu.com/')  # 加载网址
        driver.back()
        # driver.refresh()
        # print('回退刷新')
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        # driver.refresh()
        # print('刷新')
        time.sleep(1)
        #隔8次刷新一次
        # if i%8==7:time.sleep(3),print('刷新{}次'.format(i))# driver.refresh(),
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        print('当前运行条目数',i)
        return html.list
df = pd.DataFrame()  # global accumulator
keys1 = ['物流','运输','仓储','配送']
keys2=['物流经理','物流总监','物流运营']
keys3 = ['产品','数据','运营','供应链']
keys4=['人工智能','大数据']
keys_test=['物流']
# #'运营'#【物流#运输#仓储|产品#数据#运营|物流经理#物流总监#物流运营#】
# 物流经理#物流运营#物流管理#物流总监#供应链#运营经理【#产品#运营#数据#资产|#物流#运输#仓储#配送#运力#】
key = keys4[1]
print('关键字:', key)
job_url_key = 2  # industry filter: 1 = with industry, 2 = without

salary = '6'  # 5 = 15-20K, 6 = 20-30K
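
BOSS encodes the salary filter in the y_ path segment of its URLs; a tiny mapping keeps the magic values readable (a sketch; labels from the comment above):

BOSS_SALARY_LABELS = {'5': '15-20K', '6': '20-30K'}
# e.g. BOSS_SALARY_LABELS[salary] -> '20-30K'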

if __name__ == '__main__':
    # jobMesssage()
    # jobRequire()
    # opt = ChromeOptions()  # 创建chrome参数
    # opt.headless = False  # 显示浏览器
    # driver = Chrome(options=opt)  # 浏览器实例化
    # # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    # url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
    #'https://m.zhipin.com/job_detail/?city=101280600&source=10&query=%E6%9D%AD%E5%B7%9E'
    # url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
    # # 招聘公司信息获取
    # myWeb = Web(url)  # 实例化类
    # time.sleep(0.3)
    # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
    # df5 = jobRequire(html)  # 获取职位需求信息
    # print(df5)
    # time.sleep(0.5)

    ''''''
    # Chrome
    opt = ChromeOptions()  # browser options
    # Skip image loading
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # opt.add_experimental_option("prefs", prefs)
    # Don't wait for the full page load
    opt.page_load_strategy = 'eager'
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # instantiate the browser
    # Maximize the current window
    driver.maximize_window()
    # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    ''''''
    '''
    #火狐浏览器
    opt = FirefoxOptions()  # ChromeOptions()
    #加载图片632
    # opt.headless = False  # 显示浏览器
    # driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化
    # # driver.set_window_size(400, 900)
    # 不加载图片
    # opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # 显示浏览器
    driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化
    '''
    # QR-code login
    url = 'https://www.zhipin.com/i100502-c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
    driver.get(url)  # 加载网址
    time.sleep(2)
    driver.find_element_by_xpath(
        "//div[@class='btns']/a[@ka='header-login']").click()  # click "account login"
    time.sleep(5)
    '''
    # 加入数组
    keys = keys3 + keys4   # keys_test#keys3.extend(keys2)#确定数组序列keys=np.vstack(keys4)
    print('关键字:', keys)
    job_url_key = 1  # 选择是否带行业1带行业2不带行业
    df_key = pd.DataFrame()
    for j in range(len(keys)):
        key = keys[j]

        for i in range(6):  # +str(i);key=
            try:
                print(str(i), '获取第{}页数据'.format(i + 1))
                #物流行业
                job_url1 = 'https://www.zhipin.com/i100502-c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
                    i + 1) + '&ka=page-' + str(i + 1)#物流行业
                      #'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81&ka=sel-salary-6'
                # 不限行业
                job_url2 = 'https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key + '&city=101210100&position=&ka=sel-salary-' + salary + '&page=' + str(
                    i + 1) + '&ka=page-' + str(i + 1)#不限行业
                # 选择是否带行业
                if job_url_key == 1:
                    url = job_url1
                else:
                    url = job_url2

                # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81&city=101210100&industry=&position='
                # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81&ka=sel-salary-6'
                # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81&city=101210100&position=&ka=sel-salary-6&page=1&ka=page-1'
                print(url)
                # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
                # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
                # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
                # ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
                time1 = time.time()  # 计算时长
                # 获取招聘职位信息
                myWeb = Web(url)
                html = myWeb.web()  # 获取招聘岗位信息
                # html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
                time.sleep(1.5)
                # print(html)
                df1 = jobMesssage(html)
                # 加入关键字
                df_key["关键字"] = key,
                # df2=df_key["关键字"]
                df3 = pd.concat([df1, df_key], axis=1)
                df = pd.concat([df, df3], axis=0)
                # df = pd.concat([df1, df], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
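                # to_json rewrites jobBoss.json with the accumulated DataFrame after
                # every page, so partial results survive if a later page fails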
                # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
                # print(url_b)

                # # 招聘公司信息获取
                # myWeb = Web(url)  # 实例化类
                # time.sleep(0.3)
                # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                # df2 = jobRequire(html)  # 获取职位需求信息
                # print(df2)
                # time.sleep(0.5)
                #
                # df3 = pd.concat([df1, df2], axis=1)
                # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
                # df = pd.concat([df, df3], axis=0)
                # print(df)
                # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                # time.sleep(0.5)

                time2 = time.time()  # 计算时长
                print(str(i), '第{}页数据正常'.format(i + 1))
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), '第{}页数据异常'.format(i + 1))
    ''''''
    ''''''
    # 写入excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # 执行多线程
    '''

    ''''''
    #单独写入excel+批量沟通
    write_only()
    ''''''

    try:  # shut down the browser and its driver process
        # close every open window first, then end the session; driver methods
        # would raise after quit(), so quit() comes last
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill leftover chromedriver processes (Windows)
    except:
        print('后台浏览器已关闭')

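The fixed time.sleep calls above are fragile when pages load slowly. A minimal sketch of the same wait with Selenium's explicit-wait API (the 'div.job-list' selector matches the container the scraper already parses; the 15-second timeout is an assumption):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_job_list(driver, timeout=15):
    # block until the job-list container appears in the DOM, or raise TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.job-list')))
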
Boss full code 2 - extracting the job requirements and company info separately:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    # with open('jobhtml.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-list'})
    # print(html.list)
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        # print(item,i,sep=',')
        for i, item in enumerate(item.list):  # 获取每个招聘条目
            # print(item, i, sep=',')
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                # print(df_jobMesssage)
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
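                # mode='a+' appends each batch to job.csv without a header row;
                # remove the file before a fresh run to avoid mixing in old rows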
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), '公司信息写入正常')
            except:
                print(str(i), '公司信息写入异常')
    return df
def jobRequire(html):
    # df = pd.DataFrame()
    df = {}  # 定义字典
    # # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
    # url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
    # # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
    # cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
    # # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
    # # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
    # headers = {
    #     'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    #     'Cookie': cookie,
    #     'Connection': 'keep - alive',
    #     'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
    # }
    # # 新闻链接
    # # session = requests.session()
    # res = requests.get(url=url, headers=headers, timeout=30)
    # res.encoding = 'utf-8'
    # res.raise_for_status()
    # res.encoding = res.apparent_encoding
    # html = BeautifulSoup(res.text, 'html.parser')
    # time.sleep(3)
    # print(html)
    # # 存入本地
    # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
    #     f.write(res.text)
    # with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
    for i, item in enumerate(html):
        # print(item,1,sep=',')
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # 上级框架,
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
                                                                                                                   ''),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df
class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期','是否投递']
        new_worksheet['A1'].value = title
        new_worksheet.range('l:l').row_height=20
        new_worksheet.range('l:l').column_width=11

        for i in range(len(self.data)):
            try:
                # df_w = jobRequire(data[i]['招聘职位网址'])
                # print(data[i]['招聘职位网址'])
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # data[i]['发布日期']
                new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # data[i]['公司规模']
                new_worksheet.cells[i + 1, 7].value = ''  # data[i]['所属行业']
                new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
                # new_worksheet.cells[i + 1, 9].value =df_w['招聘要求']#str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
                new_worksheet.cells[i + 1, 10].value = ''  # data[i]['招聘公司网址']
                # new_worksheet.cells[i + 1, 11].value = df_w['公司信息']#str(df_w['公司信息'].values).strip("['").strip("']").strip('')
                new_worksheet.cells[i + 1, 12].value = ''  # data[i]['福利']
                # 修改项目
                new_worksheet.cells[i + 1, 13].value = key  # keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary band
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')
        '''
        # 招聘公司信息获取
        for i in range(len(self.data)):
            try:
                # 招聘公司信息获取
                time1 = time.time()  # 计算时长
                myWeb = Web(url)  # 实例化类
                time.sleep(0.5+1)
                html = myWeb.web_a(data[i]['招聘职位网址'],i)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                df_w = jobRequire(html)  # 获取职位需求信息
                print(df_w)
                time.sleep(2)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel数据-2模块写入正常')
                time2 = time.time()  # 计算时长
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), 'Excel数据-2模块写入异常')
        '''
        # new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # the original passed target=self.wE_r(), which ran wE_r synchronously in
        # the parent and handed Process a None target; wE_r also reads the
        # module-global `data`, so a plain call preserves the actual behavior
        self.wE_r()

# query each posting separately and write it to Excel
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    # 创建一个worksheet
    sh = wb.sheets['Boss']
    rng_jobRequire = [v for v in sh.range("i:i").value if v is not None]  # non-empty cells in column I
    j = sh.range('a1').expand('table').rows.count  # 序号
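    # assumption: the Boss sheet follows the writeExcel layout above, where
    # title index 8 ('招聘职位网址') lands in column I, so each cell below the
    # header is a job-detail URL to revisit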
    app.display_alerts = False
    # app.screen_updating = False
    myWeb = Web(url)  # 实例化类
    for i in range(len(rng_jobRequire) - 1):
        try:
            html = myWeb.web_a(rng_jobRequire[i + 1],i) # 获取招聘要求信息
            print(rng_jobRequire[i + 1])
            df_w = jobRequire(html)
            print(df_w)
            # print(df4.index)
            # print(df4.iloc[0,0])
            sh.cells[i + 1, 9].value = df_w['招聘要求']
            sh.cells[i + 1, 11].value = df_w['公司信息']
            # print(str(i), '招聘要求写入正常')

        except:
            print(str(i), "数据查询错误")

    # sh.autofit()
    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()
class Web:

    def __init__(self, url):
        self.url = url

    # 获取招聘职位信息
    def web(self):
        # driver.get('https://www.baidu.com/')  # 加载网址
        # driver.refresh()
        # driver.get('https://www.baidu.com/')  # 加载网址
        # driver.back()
        # driver.refresh()
        # time.sleep(0.5+0.5)
        driver.get(self.url)  # 加载网址
        time.sleep(3)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        # with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list

    # fetch the job requirements and company info
    def web_a(self, url, i):

        driver.get('https://www.baidu.com/')  # 加载网址
        driver.refresh()
        driver.get('https://www.baidu.com/')  # 加载网址
        driver.back()
        # driver.refresh()
        # print('回退刷新')
        time.sleep(1.5)
        driver.get(url)  # 加载网址
        # driver.refresh()
        # print('刷新')
        time.sleep(1)
        # every 8th item, pause longer to lower the chance of triggering anti-crawl limits
        if i % 8 == 7:
            time.sleep(13)
            print('刷新{}次'.format(i))

        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)

        print('当前运行条目数',i)
        return html.list
df = pd.DataFrame()  # module-level globals shared by the functions above
key = '物流运营'  # search keyword; other candidates: 物流经理/物流管理/物流/运营/数据/运输/仓储/配送
salary = '6'  # salary band code: '5' = 15-20K, '6' = 20-30K

if __name__ == '__main__':
    # jobMesssage()
    # jobRequire()
    # opt = ChromeOptions()  # 创建chrome参数
    # opt.headless = False  # 显示浏览器
    # driver = Chrome(options=opt)  # 浏览器实例化
    # # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    # url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
    #'https://m.zhipin.com/job_detail/?city=101280600&source=10&query=%E6%9D%AD%E5%B7%9E'
    # url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
    # # 招聘公司信息获取
    # myWeb = Web(url)  # 实例化类
    # time.sleep(0.3)
    # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
    # df5 = jobRequire(html)  # 获取职位需求信息
    # print(df5)
    # time.sleep(0.5)

    opt = ChromeOptions()  # 创建chrome参数
    # 不加载图片
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # 显示浏览器
    driver = Chrome(options=opt)  # 浏览器实例化
    # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)

    '''
    opt = FirefoxOptions()  # ChromeOptions()
    #加载图片
    # opt.headless = False  # 显示浏览器
    # driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化
    # # driver.set_window_size(400, 900)

    # 不加载图片
    opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # 显示浏览器
    driver = webdriver.Firefox(options=opt)  # Chrome(options=opt)  # 浏览器实例化
    '''

    # url = 'https://www.zhipin.com/i100502-c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
    ''''''
    for i in range(6):  # up to 6 result pages
        try:
            print(str(i), '获取第{}页数据'.format(i + 1))
            # url = 'https://www.zhipin.com/i100502-c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
            #     i + 1) + '&ka=page-' + str(i + 1)#物流行业


            url = 'https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key + '&city=101210100&position=&ka=sel-salary-' + salary + '&page=' + str(
                i + 1) + '&ka=page-' + str(i + 1)#不限行业

            # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81&city=101210100&industry=&position='
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81&ka=sel-salary-6'
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81&city=101210100&position=&ka=sel-salary-6&page=1&ka=page-1'

            print(url)
            # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
            # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
            # ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
            time1 = time.time()  # 计算时长
            # 获取招聘职位信息
            myWeb = Web(url)
            html = myWeb.web()  # 获取招聘岗位信息
            # html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
            time.sleep(5.5)
            # print(html)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
            # print(url_b)
            # # 招聘公司信息获取
            # myWeb = Web(url)  # 实例化类
            # time.sleep(0.3)
            # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
            # df2 = jobRequire(html)  # 获取职位需求信息
            # print(df2)
            # time.sleep(0.5)
            #
            # df3 = pd.concat([df1, df2], axis=1)
            # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df = pd.concat([df, df3], axis=0)
            # print(df)
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # time.sleep(0.5)
            time2 = time.time()  # 计算时长
            print(str(i), '第{}页数据正常'.format(i + 1))
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '第{}页数据异常'.format(i + 1))

    # 写入excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # write the workbook (runs synchronously; see writeExcel.run)
    ''''''

    '''
    #单独写入excel
    write_only()
    '''

    try:  # shut down the browser and its driver process
        # close every open window first, then end the session; driver methods
        # would raise after quit(), so quit() comes last
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill leftover chromedriver processes (Windows)
    except:
        print('后台浏览器已关闭')
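
The search URLs above are assembled by string concatenation, which relies on the keyword already being percent-encoded (or on the browser encoding it). A sketch of the same construction with urllib (parameter names mirror the hand-built URLs; the exact query layout zhipin.com expects is an assumption):

from urllib.parse import urlencode

def build_search_url(key, salary, page, city='101210100'):
    # urlencode percent-encodes the Chinese keyword and joins the parameters
    params = {'query': key, 'city': city, 'position': '',
              'ka': 'sel-salary-' + salary, 'page': page}
    return 'https://www.zhipin.com/c{}/y_{}/?{}'.format(city, salary, urlencode(params))

print(build_search_url('物流运营', '6', 1))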

Querying each matching posting separately and writing the details to Excel

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    # with open('jobhtml.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-list'})
    # print(html.list)
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        # print(item,i,sep=',')
        for i, item in enumerate(item.list):  # 获取每个招聘条目
            # print(item, i, sep=',')
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                # print(df_jobMesssage)
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), '招聘职位写入正常')
            except:
                print(str(i), '招聘职位写入异常')
    return df


def jobRequire(html):
    # df = pd.DataFrame()
    df = {}  # 定义字典
    # # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
    # url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
    # # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
    # cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
    # # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
    # # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
    # headers = {
    #     'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    #     'Cookie': cookie,
    #     'Connection': 'keep - alive',
    #     'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
    # }
    # # 新闻链接
    # # session = requests.session()
    # res = requests.get(url=url, headers=headers, timeout=30)
    # res.encoding = 'utf-8'
    # res.raise_for_status()
    # res.encoding = res.apparent_encoding
    # html = BeautifulSoup(res.text, 'html.parser')
    # time.sleep(3)
    # print(html)
    # # 存入本地
    # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
    #     f.write(res.text)
    # with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
    for i, item in enumerate(html):
        # print(item,1,sep=',')
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # 上级框架,
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
                                                                                                                   ''),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df


class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title

        for i in range(len(self.data)):
            try:
                # df_w = jobRequire(data[i]['招聘职位网址'])
                # print(data[i]['招聘职位网址'])
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # data[i]['发布日期']
                new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # data[i]['公司规模']
                new_worksheet.cells[i + 1, 7].value = ''  # data[i]['所属行业']
                new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
                # new_worksheet.cells[i + 1, 9].value =df_w['招聘要求']#str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
                new_worksheet.cells[i + 1, 10].value = ''  # data[i]['招聘公司网址']
                # new_worksheet.cells[i + 1, 11].value = df_w['公司信息']#str(df_w['公司信息'].values).strip("['").strip("']").strip('')
                new_worksheet.cells[i + 1, 12].value = ''  # data[i]['福利']
                # 修改项目
                new_worksheet.cells[i + 1, 13].value = key  # keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary band
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')
        # 招聘公司信息获取
        for i in range(len(self.data)):
            try:
                # 招聘公司信息获取
                time1 = time.time()  # 计算时长
                myWeb = Web(url)  # 实例化类
                time.sleep(0.5)
                html = myWeb.web_a(data[i]['招聘职位网址'])  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                df_w = jobRequire(html)  # 获取职位需求信息
                print(df_w)
                time.sleep(3)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel数据-2模块写入正常')
                time2 = time.time()  # 计算时长
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), 'Excel数据-2模块写入异常')

        new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # the original passed target=self.wE_r(), which ran wE_r synchronously in
        # the parent and handed Process a None target; wE_r also reads the
        # module-global `data`, so a plain call preserves the actual behavior
        self.wE_r()


class Web:
    def __init__(self, url):
        self.url = url

    # 获取招聘职位信息
    def web(self):
        driver.back()
        # driver.refresh()
        time.sleep(0.5)
        driver.get(self.url)  # 加载网址
        time.sleep(1.5)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        # with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list

    # fetch the job requirements and company info
    def web_a(self, url):
        driver.back()
        # driver.refresh()
        print('回退刷新')
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        # driver.refresh()
        # print('刷新')
        time.sleep(2)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list


df = pd.DataFrame()  # module-level globals shared by the functions above
key = '物流管理'  # search keyword; other candidates: 物流经理/物流运营
salary = '5'  # salary band code: '5' = 15-20K, '6' = 20-30K
if __name__ == '__main__':
    # jobMesssage()
    # jobRequire()
    # opt = ChromeOptions()  # 创建chrome参数
    # opt.headless = False  # 显示浏览器
    # driver = Chrome(options=opt)  # 浏览器实例化
    # # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    # url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
    # url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
    # # 招聘公司信息获取
    # myWeb = Web(url)  # 实例化类
    # time.sleep(0.3)
    # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
    # df5 = jobRequire(html)  # 获取职位需求信息
    # print(df5)
    # time.sleep(0.5)

    opt = ChromeOptions()  # 创建chrome参数
    # 不加载图片
    prefs = {"profile.managed_default_content_settings.images": 2}
    opt.add_experimental_option("prefs", prefs)
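    # setting this content preference to 2 blocks image downloads, which speeds
    # up page loads while scraping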
    opt.headless = False  # 显示浏览器
    driver = Chrome(options=opt)  # 浏览器实例化
    # driver=webdriver.Chrome()
    driver.set_window_size(300, 700)

    url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'

    for i in range(3):  # up to 3 result pages
        try:
            print(str(i), '获取第{}页数据'.format(i + 1))
            url = 'https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
                i + 1) + '&ka=page-' + str(i + 1)
            print(url)
            # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
            # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
            # ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
            time1 = time.time()  # 计算时长
            # 获取招聘职位信息
            myWeb = Web(url)
            html = myWeb.web()  # 获取招聘岗位信息
            # html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
            time.sleep(0.5)
            # print(html)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
            # print(url_b)
            # # 招聘公司信息获取
            # myWeb = Web(url)  # 实例化类
            # time.sleep(0.3)
            # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
            # df2 = jobRequire(html)  # 获取职位需求信息
            # print(df2)
            # time.sleep(0.5)
            #
            # df3 = pd.concat([df1, df2], axis=1)
            # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df = pd.concat([df, df3], axis=0)
            # print(df)
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # time.sleep(0.5)
            time2 = time.time()  # 计算时长
            print(str(i), '第{}页数据正常'.format(i + 1))
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '第{}页数据异常'.format(i + 1))

    # 写入excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # write the workbook (runs synchronously; see writeExcel.run)

    try:  # shut down the browser and its driver process
        # close every open window first, then end the session; driver methods
        # would raise after quit(), so quit() comes last
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill leftover chromedriver processes (Windows)
    except:
        print('后台浏览器已关闭')
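
When no cell-level formatting is needed, the collected JSON can also go straight to Excel with pandas, skipping the cell-by-cell xlwings loop. A minimal sketch (assumes openpyxl is installed for .xlsx output):

import pandas as pd

# load the records written by jobMesssage and dump them as a single sheet
df = pd.read_json('jobBoss.json', orient='records')
df.to_excel('jobBoss.xlsx', index=False, sheet_name='worksheet')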
