import re,requests
from lxml import etree
#import pymysql,sys
import csv
import time,random
from fake_useragent import UserAgent
# Maximum number of retry attempts for each HTTP request (consumed by parse()).
COUNT = 3
def parse(COUNT, header, url):
    """Fetch *url*, retrying up to COUNT times.

    Parameters:
        COUNT: maximum number of attempts (a local copy; the module-level
            COUNT constant is not mutated).
        header: dict of HTTP headers to send.
        url: target URL.

    Returns:
        The requests.Response on HTTP 200, or 0 once every attempt has
        failed (callers treat a falsy result as "no response").
    """
    while COUNT:
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
            # A non-200 answer consumes one retry.
            COUNT -= 1
        except requests.RequestException:
            # Was a bare `except:` that also swallowed KeyboardInterrupt /
            # SystemExit; narrowed to network/timeout/URL errors only.
            COUNT -= 1
    # All retries exhausted — keep the original falsy sentinel for callers.
    return 0
# Default request headers: a random User-Agent from fake_useragent plus a
# Chinese Accept-Language so zhaopin.com serves the zh-CN pages.
header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
def save_to_csv(job_name, company_name, company_link, advantage, salary, place,
                post_time, job_nature, work_experience, education, job_number,
                job_kind, job_content, job_place, company_info, company_size,
                company_nature, company_industry, company_home_link, company_place,
                path=r'C:\Users\241\Desktop\慕亚东\智联\智联大连pythonjob.csv'):
    """Append one job record as a single CSV row.

    The twenty positional fields are written in order as one row.
    *path* defaults to the original hard-coded output file; pass a different
    location to redirect output (backward-compatible generalization).
    """
    row = [job_name, company_name, company_link, advantage, salary, place,
           post_time, job_nature, work_experience, education, job_number,
           job_kind, job_content, job_place, company_info, company_size,
           company_nature, company_industry, company_home_link, company_place]
    # mode 'a' + newline='' is the csv-module recommended way to append rows.
    with open(path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)
def get_content(job_url):
    # print('*******************'+job_url)
    '''Fetch one zhaopin job-detail page and append the parsed fields to the CSV.

    NOTE(review): if an xpath section below matches nothing, its variables are
    never bound and the final save_to_csv() call raises NameError — confirm
    that every detail page contains all sections.
    '''
    global a, count  # NOTE(review): neither name is used here — looks vestigial
    p = random.randint(1, 3)
    time.sleep(p)  # sleep 1-3 s between requests to throttle the crawl
    html = requests.get(job_url, headers=header, timeout=10)
    response = etree.HTML(html.content)
    link = job_url  # job posting link
    # Only 'jobs.zhaopin' detail pages use the layout parsed below.
    if u'jobs.zhaopin' in link:
        # Header section: title, company and welfare tags.
        for i in response.xpath('//div[@class="inner-left fl"]'):
            job_name = ''.join(i.xpath('h1/text()'))  # job title
            company_name = ''.join(i.xpath('h2/a/text()'))  # company name
            company_link = ''.join(i.xpath('h2/a/@href'))  # company page link
            advantage = ','.join(i.xpath('div[1]/span/text()'))  # company benefits
        # Summary list: salary, location, date and other posting attributes.
        for i in response.xpath('//ul[@class="terminal-ul clearfix"]'):
            salary = ''.join(i.xpath('li[1]/strong/text()'))  # monthly salary
            place = ''.join(i.xpath('li[2]/strong/a/text()'))  # work location
            post_time = ''.join(i.xpath('li[3]//span[@id="span4freshdate"]/text()'))  # publish date
            job_nature = ''.join(i.xpath('li[4]/strong/text()'))  # employment type
            work_experience = ''.join(i.xpath('li[5]/strong/text()'))  # required experience
            education = ''.join(i.xpath('li[6]/strong/text()'))  # minimum education
            job_number = ''.join(i.xpath('li[7]/strong/text()'))  # number of openings
            job_kind = ''.join(i.xpath('li[8]/strong/a/text()'))  # job category
        # print('*******************'+job_url)
        try:
            # Job description: first line of the description tab (absolute xpath,
            # so the leftover loop variable `i` still works here).
            job_content = ''.join(i.xpath('//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]//p//text()')).split("\n")[0]
        except:
            job_content = ''
        for i in response.xpath('//div[@class="tab-inner-cont"]')[0:1]:
            job_place = i.xpath('h2/text()')[0].strip()  # work address (detailed)
        for i in response.xpath('//div[@class="tab-inner-cont"]')[1:2]:
            reg = re.compile(r'<[^>]+>')  # strip any residual HTML tags
            company_content = reg.sub('', i.xpath('string(.)')).replace(' ', '')  # company introduction
            company_info = company_content
        # Company info list: the li layout shifts depending on whether the page
        # advertises a company homepage.
        for i in response.xpath('//ul[@class="terminal-ul clearfix terminal-company mt20"]'):
            if u'公司主页' in i.xpath('string(.)'):  # '公司主页' means "company homepage"
                company_size = ''.join(i.xpath('li[1]/strong/text()'))
                company_nature = ''.join(i.xpath('li[2]/strong/text()'))
                company_industry = ''.join(i.xpath('li[3]/strong/a/text()'))
                company_home_link = ''.join(i.xpath('li[4]/strong/a/text()'))
                company_place = ''.join(i.xpath('li[5]/strong/text()'))
            else:
                company_size = ''.join(i.xpath('li[1]/strong/text()'))
                company_nature = ''.join(i.xpath('li[2]/strong/text()'))
                company_industry = ''.join(i.xpath('li[3]/strong/a/text()'))
                # NOTE(review): a list, unlike the str in the other branch —
                # the CSV cell will read "['无公司主页']"; confirm intentional.
                company_home_link = [u'无公司主页']
                company_place = ''.join(i.xpath('li[4]/strong/text()'))
        save_to_csv(job_name, company_name, company_link, advantage, salary, place, post_time, job_nature, work_experience, education, job_number, job_kind, job_content, job_place, company_info, company_size, company_nature, company_industry, company_home_link, company_place)
def get_url():
    """Crawl up to 10 result pages of the Dalian 'python' search on zhaopin.

    Each API page holds 60 results. Every positionURL found in the JSON
    response text is handed to get_content(); pages whose download fails all
    retries are skipped instead of crashing.
    """
    for page in range(0, 10):
        print('==========' + str(page) + '=============')
        num = page * 60  # API paging offset (60 results per page)
        # Dalian (cityId=600) python job-search API endpoint.
        url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(num) + '&pageSize=60&cityId=600&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python&kt=3&lastUrlQuery=%7B%22p%22:2,%22jl%22:%22600%22,%22kw%22:%22python%22,%22kt%22:%223%22%7D'
        # Nationwide variant kept for reference:
        # url1='https://fe-api.zhaopin.com/c/i/sou?start='+str(num)+'&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python&kt=3&lastUrlQuery=%7B%22p%22:3,%22jl%22:%22489%22,%22kw%22:%22python%22,%22kt%22:%223%22%7D'
        header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
        response = parse(COUNT, header, url)
        if not response:
            # parse() returned 0: every retry failed. The original crashed
            # here with AttributeError on response.content — skip the page.
            continue
        # Split the raw JSON text on the key; each fragment after the first
        # starts with the value of one positionURL field.
        detail = str(response.content, 'utf-8').split("positionURL")
        for i in range(1, len(detail)):
            positionURL = detail[i].split('","')[0].split('":"')[1]
            print('*******************' + str(i) + '------' + positionURL)
            get_content(positionURL)
if __name__ == '__main__':
    # Entry point: run the full crawl when executed as a script.
    get_url()
# (CSDN blog-post footer artifact, preserved as a comment so the file stays
#  valid Python: "智联 / 最新推荐文章于 2024-03-18 09:23:32 发布")