import re,requests
from lxml import etree
#import pymysql,sys
import csv
import time,random
from fake_useragent import UserAgent
# Maximum number of retry attempts for each HTTP request (consumed by parse()).
COUNT = 3
def parse(COUNT, header, url):
    """Fetch *url*, retrying up to COUNT times.

    Parameters:
        COUNT: maximum number of attempts (a local copy; the module-level
            COUNT constant is not mutated).
        header: dict of HTTP headers to send.
        url: target URL.

    Returns:
        The requests.Response on HTTP 200, or 0 once every attempt has
        failed (callers treat a falsy result as "no response").
    """
    while COUNT:
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
            # A non-200 answer consumes one retry.
            COUNT -= 1
        except requests.RequestException:
            # Was a bare `except:` that also swallowed KeyboardInterrupt /
            # SystemExit; narrowed to network/timeout/URL errors only.
            COUNT -= 1
    # All retries exhausted — keep the original falsy sentinel for callers.
    return 0
# Default request headers: a random User-Agent from fake_useragent plus a
# Chinese Accept-Language so zhaopin.com serves the zh-CN pages.
header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
def save_to_csv(job_name, company_name, company_link, advantage, salary, place,
                post_time, job_nature, work_experience, education, job_number,
                job_kind, job_content, job_place, company_info, company_size,
                company_nature, company_industry, company_home_link, company_place,
                path=r'C:\Users\241\Desktop\慕亚东\智联\智联大连pythonjob.csv'):
    """Append one job record as a single CSV row.

    The twenty positional fields are written in order as one row.
    *path* defaults to the original hard-coded output file; pass a different
    location to redirect output (backward-compatible generalization).
    """
    row = [job_name, company_name, company_link, advantage, salary, place,
           post_time, job_nature, work_experience, education, job_number,
           job_kind, job_content, job_place, company_info, company_size,
           company_nature, company_industry, company_home_link, company_place]
    # mode 'a' + newline='' is the csv-module recommended way to append rows.
    with open(path, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)
def get_content(job_url):
    # print('*******************'+job_url)
    '''Fetch one zhaopin job-detail page and append the parsed fields to the CSV.

    NOTE(review): if an xpath section below matches nothing, its variables are
    never bound and the final save_to_csv() call raises NameError — confirm
    that every detail page contains all sections.
    '''
    global a, count  # NOTE(review): neither name is used here — looks vestigial
    p = random.randint(1, 3)
    time.sleep(p)  # sleep 1-3 s between requests to throttle the crawl
    html = requests.get(job_url, headers=header, timeout=10)
    response = etree.HTML(html.content)
    link = job_url  # job posting link
    # Only 'jobs.zhaopin' detail pages use the layout parsed below.
    if u'jobs.zhaopin' in link:
        # Header section: title, company and welfare tags.
        for i in response.xpath('//div[@class="inner-left fl"]'):
            job_name = ''.join(i.xpath('h1/text()'))  # job title
            company_name = ''.join(i.xpath('h2/a/text()'))  # company name
            company_link = ''.join(i.xpath('h2/a/@href'))  # company page link
            advantage = ','.join(i.xpath('div[1]/span/text()'))  # company benefits
        # Summary list: salary, location, date and other posting attributes.
        for i in response.xpath('//ul[@class="terminal-ul clearfix"]'):
            salary = ''.join(i.xpath('li[1]/strong/text()'))  # monthly salary
            place = ''.join(i.xpath('li[2]/strong/a/text()'))  # work location
            post_time = ''.join(i.xpath('li[3]//span[@id="span4freshdate"]/text()'))  # publish date
            job_nature = ''.join(i.xpath('li[4]/strong/text()'))  # employment type
            work_experience = ''.join(i.xpath('li[5]/strong/text()'))  # required experience
            education = ''.join(i.xpath('li[6]/strong/text()'))  # minimum education
            job_number = ''.join(i.xpath('li[7]/strong/text()'))  # number of openings
            job_kind = ''.join(i.xpath('li[8]/strong/a/text()'))  # job category
        # print('*******************'+job_url)
        try:
            # Job description: first line of the description tab (absolute xpath,
            # so the leftover loop variable `i` still works here).
            job_content = ''.join(i.xpath('//div[@class="tab-cont-box"]/div[@class="tab-inner-cont"]//p//text()')).split("\n")[0]
        except:
            job_content = ''
        for i in response.xpath('//div[@class="tab-inner-cont"]')[0:1]:
            job_place = i.xpath('h2/text()')[0].strip()  # work address (detailed)
        for i in response.xpath('//div[@class="tab-inner-cont"]')[1:2]:
            reg = re.compile(r'<[^>]+>')  # strip any residual HTML tags
            company_content = reg.sub('', i.xpath('string(.)')).replace(' ', '')  # company introduction
            company_info = company_content
        # Company info list: the li layout shifts depending on whether the page
        # advertises a company homepage.
        for i in response.xpath('//ul[@class="terminal-ul clearfix terminal-company mt20"]'):
            if u'公司主页' in i.xpath('string(.)'):  # '公司主页' means "company homepage"
                company_size = ''.join(i.xpath('li[1]/strong/text()'))
                company_nature = ''.join(i.xpath('li[2]/strong/text()'))
                company_industry = ''.join(i.xpath('li[3]/strong/a/text()'))
                company_home_link = ''.join(i.xpath('li[4]/strong/a/text()'))
                company_place = ''.join(i.xpath('li[5]/strong/text()'))
            else:
                company_size = ''.join(i.xpath('li[1]/strong/text()'))
                company_nature = ''.join(i.xpath('li[2]/strong/text()'))
                company_industry = ''.join(i.xpath('li[3]/strong/a/text()'))
                # NOTE(review): a list, unlike the str in the other branch —
                # the CSV cell will read "['无公司主页']"; confirm intentional.
                company_home_link = [u'无公司主页']
                company_place = ''.join(i.xpath('li[4]/strong/text()'))
        save_to_csv(job_name, company_name, company_link, advantage, salary, place, post_time, job_nature, work_experience, education, job_number, job_kind, job_content, job_place, company_info, company_size, company_nature, company_industry, company_home_link, company_place)
def get_url():
    """Crawl up to 10 result pages of the Dalian 'python' search on zhaopin.

    Each API page holds 60 results. Every positionURL found in the JSON
    response text is handed to get_content(); pages whose download fails all
    retries are skipped instead of crashing.
    """
    for page in range(0, 10):
        print('==========' + str(page) + '=============')
        num = page * 60  # API paging offset (60 results per page)
        # Dalian (cityId=600) python job-search API endpoint.
        url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(num) + '&pageSize=60&cityId=600&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python&kt=3&lastUrlQuery=%7B%22p%22:2,%22jl%22:%22600%22,%22kw%22:%22python%22,%22kt%22:%223%22%7D'
        # Nationwide variant kept for reference:
        # url1='https://fe-api.zhaopin.com/c/i/sou?start='+str(num)+'&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=python&kt=3&lastUrlQuery=%7B%22p%22:3,%22jl%22:%22489%22,%22kw%22:%22python%22,%22kt%22:%223%22%7D'
        header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
        response = parse(COUNT, header, url)
        if not response:
            # parse() returned 0: every retry failed. The original crashed
            # here with AttributeError on response.content — skip the page.
            continue
        # Split the raw JSON text on the key; each fragment after the first
        # starts with the value of one positionURL field.
        detail = str(response.content, 'utf-8').split("positionURL")
        for i in range(1, len(detail)):
            positionURL = detail[i].split('","')[0].split('":"')[1]
            print('*******************' + str(i) + '------' + positionURL)
            get_content(positionURL)
if __name__ == '__main__':
    # Entry point: run the full crawl when executed as a script.
    get_url()
# (CSDN blog-post footer artifact, preserved as a comment so the file stays
#  valid Python: "智联 / 最新推荐文章于 2024-03-18 09:23:32 发布")