利用 Python 3.7.2 爬取 51job,想找一份离家近的工作

从来没碰过 Python,从网上找的例子模仿着写了一个抓取 51job 武汉地区 Java 招聘信息的爬虫
参考例子

运行效果:
在这里插入图片描述
数据样式:
在这里插入图片描述

代码:

import urllib
import re, codecs
import time, random
import requests
from lxml import html
from urllib import parse
import xlwt
import traceback

# Search keyword; 51job expects the keyword to be URL-encoded twice.
key = parse.quote(parse.quote('java'))

# Browser-like request headers to get past simple anti-scraping checks.
headers = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

def get_links(i):
    """Fetch page *i* of the 51job search results and return the list of
    job-detail URLs found on that page.

    180200 in the URL is 51job's area code (presumably Wuhan, matching the
    surrounding text — confirm against 51job's area-code table).
    """
    # Bug fixes vs. original: the URL ended in '.htmll' (extra 'l'), and
    # headers was passed positionally, which requests.get() interprets as
    # the `params` argument — it must be passed as headers=headers.
    url = ('http://search.51job.com/list/180200,000000,0000,00,9,99,'
           + key + ',2,' + str(i) + '.html')
    r = requests.get(url, headers=headers, timeout=10)
    r.encoding = 'gbk'  # legacy 51job pages are gbk-encoded
    # Extract each job posting's detail-page href from the listing HTML.
    reg = re.compile(
        r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">',
        re.S)
    return re.findall(reg, r.text)


# 多页处理,下载到文件
def get_content(link):
    """Fetch one job-detail page and parse it into a 12-element row matching
    the header written by run().

    Returns the parsed row, or an empty list if parsing failed (the traceback
    is printed). Fix vs. original: `item` is initialised before the try block,
    so an early xpath IndexError no longer turns into an UnboundLocalError at
    the `return` statement.
    """
    item = []
    # headers must be a keyword argument (positionally it would be `params`).
    r1 = requests.get(link, headers=headers, timeout=10)
    r1.encoding = 'gbk'  # legacy 51job pages are gbk-encoded
    t1 = html.fromstring(r1.text)
    try:
        job = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0].strip()
        company = t1.xpath('//p[@class="cname"]/a/text()')[0].strip()
        print('公司:', company)
        # The "msg ltype" paragraph holds, in order: area / required
        # experience / education / headcount. Evaluate the xpath once.
        msg = t1.xpath('//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()')
        area = msg[0].strip()
        workyear = msg[1].strip()
        education = msg[2].strip()
        people = msg[3].strip()
        # This column actually stores the detail-page URL (header says "URL").
        date = link
        # Job description: regex-slice the raw HTML, then strip common tags
        # and whitespace. Note '<' is removed wholesale, so any tag fragments
        # left after the explicit replacements disappear too.
        describes = re.findall(
            re.compile('<div class="bmsg job_msg inbox">(.*?)div class="mt10"', re.S),
            r1.text)
        describe = (describes[0].strip()
                    .replace('<p>', '').replace('</p>', '')
                    .replace('<span>', '').replace('</span>', '')
                    .replace('\t', '').replace('<', '')
                    .replace('\n', '').replace('&nbsp;', ''))
        # The last "fp" paragraph holds the company address.
        company0 = t1.xpath('//p[@class="fp"]/text()')[-1]
        print('公司信地址', company0)
        # Company type and size (first two com_tag entries), space-joined.
        companytypes = t1.xpath('//div[@class="com_tag"]/p/text()')[:2]
        company1 = ''.join(' ' + part for part in companytypes)
        # Salary is taken from the raw HTML with a regex (the xpath variant in
        # the original was dead code, immediately overwritten).
        salary = re.findall(
            re.compile(r'div class="cn">.*?<strong>(.*?)</strong>', re.S),
            r1.text)[0]
        labels = t1.xpath('//div[@class="jtag"]/div[@class="t1"]/span/text()')
        label = ''.join(' ' + part for part in labels)
        # Column order must match row0 in run().
        item = [str(company), str(job), str(education), str(area),
                str(salary), str(label), str(workyear), str(people),
                str(date), str(describe), str(company0), str(company1)]
    except Exception:
        # Best-effort scraping: log the failure, return what we have (empty).
        traceback.print_exc()
    return item
def run():
    """Main flow: iterate over result pages, parse each job link, and write
    all rows to 51job.xls.

    Fixes vs. original: the 12 copy-pasted sheet1.write calls are a loop,
    a failure on one link no longer abandons the rest of that page's links,
    and rows that parsed incompletely are skipped instead of raising.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('message', cell_overwrite_ok=True)
    row0 = ["公司", "岗位", "学历", "地区", "薪水", "待遇", "工作经验",
            "需求人数", "URL", "职位信息", "联系方式", "公司信息及人数"]
    for j, title in enumerate(row0):
        sheet1.write(0, j, title)
    i = 0  # number of data rows written so far (row 0 is the header)
    for a in range(1, 72):  # page count — adjust per the total shown on the site
        print('正在爬取第{}页信息'.format(a))
        try:
            links = get_links(a)
        except Exception:
            traceback.print_exc()
            continue
        for link in links:
            # Random sleep to reduce the chance of being rate-limited/banned.
            time.sleep(random.random() + random.randint(0, 1))
            print(i)
            print(link)
            try:
                item = get_content(link)
                if len(item) < len(row0):
                    continue  # parsing failed; skip instead of writing a bad row
                for col, value in enumerate(item):
                    sheet1.write(i + 1, col, value)
                i += 1
            except Exception:
                traceback.print_exc()
                continue
    workbook.save('51job.xls')


run()

在这里插入图片描述
从页面下角获取总页数,然后在代码里修改:
在这里插入图片描述

导入出现问题要用 install:
在这里插入图片描述
在这里插入图片描述

已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页