import csv
import requests
from lxml import etree
from fake_useragent import UserAgent

# Maximum number of request retries.
COUNT = 3
def parse(COUNT, header, url):
    # Retry the request up to COUNT times; return the response on
    # HTTP 200, or None once every attempt has failed.
    while COUNT:
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
            COUNT -= 1
        except requests.RequestException:
            COUNT -= 1
    return None
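# An alternative to the hand-rolled retry loop above is requests' built-in
# retry support via urllib3. A minimal sketch; the helper name and the
# retry settings are assumptions, not part of the original script:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(retries=COUNT):
    session = requests.Session()
    retry = Retry(total=retries, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session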
def save_to_csv(url, address, experience, education, need_num, pub_time,
                salary, fuli, job_info, company_address, company_info):
    row = [url, address, experience, education, need_num, pub_time,
           salary, fuli, job_info, company_address, company_info]
    with open(r'C:\Users\241\Desktop\51Job\51job.csv', 'a', newline='',
              encoding='utf-8') as file:
        csv.writer(file).writerow(row)
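# The file above is opened in append mode and never gets a header row. A
# minimal sketch of writing the column names once before the first append;
# the os.path.exists check and the helper name are assumptions, not part
# of the original script:
import os

def ensure_csv_header(path=r'C:\Users\241\Desktop\51Job\51job.csv'):
    # Write the header only when the file does not exist yet.
    if not os.path.exists(path):
        with open(path, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(
                ['url', 'address', 'experience', 'education', 'need_num',
                 'pub_time', 'salary', 'fuli', 'job_info',
                 'company_address', 'company_info'])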
# Fetch a job-detail page and extract its fields.
def dd(url, header):
    print('***************' + url)
    response = parse(COUNT, header, url)
    if not response:
        return
    selector = etree.HTML(response.content)
    # Job description: the detail pages use several layouts, so try each
    # XPath in turn. xpath() returns an empty list rather than raising when
    # nothing matches, so fall back on emptiness, not on exceptions.
    job_info = ''.join(selector.xpath('//div[@class="bmsg job_msg inbox"]/p//text()'))
    if not job_info:
        job_info = ''.join(selector.xpath('//div[@class="bmsg job_msg inbox"]/ol//li//text()'))
    if not job_info:
        job_info = ''.join(selector.xpath('//div[@class="bmsg job_msg inbox"]/text()'))
    if not job_info:
        job_info = ''.join(selector.xpath('//div[@class="gw_yq"]//text()'))
    # Requirements line: location | experience | education | openings | date ...
    require = ''.join(selector.xpath('//p[@class="msg ltype"]//text()')).split("|")
    # Company address: second text node under the location block; the index
    # can be out of range, so guard it.
    try:
        company_address = selector.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()')[1]
    except IndexError:
        company_address = ''
    # Company profile
    company_info = ''.join(selector.xpath('//div[@class="tmsg inbox"]//text()'))
    # Salary
    salary = ''.join(selector.xpath('//div[@class="cn"]/strong/text()'))
    # Benefits
    fuli = ''.join(selector.xpath('//div[@class="t1"]//span//text()'))
    # The requirements line has 4-7 "|"-separated segments depending on
    # which optional fields the posting fills in. Default every field so an
    # unexpected segment count still produces a (partly empty) row.
    address = experience = education = need_num = pub_time = ability = ability2 = ''
    if len(require) == 5:
        address = require[0].split("\r\n\t\t\t\t")[1].split("\xa0\xa0")[0]
        experience = require[1].split("\xa0\xa0")[1]
        education = require[2].split("\xa0\xa0")[1]
        need_num = require[3].split("\xa0\xa0")[1]
        pub_time = require[4].split("\xa0\xa0")[1].split("\t\t\t")[0]
    elif len(require) == 4:
        address = require[0].split("\r\n\t\t\t\t")[1].split("\xa0\xa0")[0]
        experience = require[1].split("\xa0\xa0")[1]
        need_num = require[2].split("\xa0\xa0")[1]
        pub_time = require[3].split("\xa0\xa0")[1].split("\t\t\t")[0]
    elif len(require) == 6:
        address = require[0].split("\r\n\t\t\t\t")[1].split("\xa0\xa0")[0]
        experience = require[1].split("\xa0\xa0")[1]
        education = require[2].split("\xa0\xa0")[1]
        need_num = require[3].split("\xa0\xa0")[1]
        pub_time = require[4].split("\xa0\xa0")[1]
        ability = require[5].split("\xa0\xa0")[1].split("\t\t\t")[0]
    elif len(require) == 7:
        address = require[0].split("\r\n\t\t\t\t")[1].split("\xa0\xa0")[0]
        experience = require[1].split("\xa0\xa0")[1]
        education = require[2].split("\xa0\xa0")[1]
        need_num = require[3].split("\xa0\xa0")[1]
        pub_time = require[4].split("\xa0\xa0")[1]
        ability = require[5].split("\xa0\xa0")[1]
        ability2 = require[6].split("\xa0\xa0")[1].split("\t\t\t")[0]
    print(address, experience, education, need_num, pub_time, ability, ability2)
    save_to_csv(url, address, experience, education, need_num, pub_time,
                salary, fuli, job_info, company_address, company_info)
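# A more tolerant alternative to the length-based branching above (a sketch,
# not part of the original): normalise every segment and pad missing optional
# fields instead of matching the exact segment count. The helper name and the
# padding to seven columns are assumptions.
def split_require(require):
    # Replace non-breaking spaces and strip the tab/newline padding.
    fields = [seg.replace('\xa0', ' ').strip() for seg in require]
    fields += [''] * (7 - len(fields))   # pad absent optional fields
    return fields[:7]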
def get_content(page):
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'
           + str(page) + '.html')
    header = {'User-Agent': UserAgent().random,
              'Accept-Language': 'zh-CN,zh;q=0.9'}
    response = parse(COUNT, header, url)
    if not response:
        return
    selector = etree.HTML(response.content)
    # Detail-page links from the search-result list.
    url_pages = selector.xpath('//*[@id="resultList"]/div/p/span/a/@href')
    for url_page in url_pages:
        dd(url_page, header)

if __name__ == '__main__':
    get_content(1)
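# A sketch of crawling several result pages with a polite pause between
# requests; the page range and the one-second delay are assumptions, not
# part of the original script:
import time

def crawl_pages(first=1, last=5):
    for page in range(first, last + 1):
        get_content(page)
        time.sleep(1)   # avoid hammering the server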