利用 Python 3.7.2 爬取 51job,想找一份离家近的工作

从来没碰过 Python,从网上找的例子模仿着写了一个抓取 51job 武汉地区 Java 招聘信息的爬虫
参考例子

运行效果:
在这里插入图片描述
数据样式:
在这里插入图片描述

代码:

import urllib
import re, codecs
import time, random
import requests
from lxml import html
from urllib import parse
import xlwt
import traceback

# Search keyword; 51job expects the keyword to be URL-encoded twice.
key = parse.quote(parse.quote('java'))

# Browser-like request headers to get past simple anti-scraping checks.
headers = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

def get_links(i):
    """Fetch page *i* of the 51job search results and return the list of
    job-detail URLs found on that page.

    180200 in the URL is 51job's area code (presumably Wuhan, matching the
    surrounding text — confirm against 51job's area-code table).
    """
    # Bug fixes vs. original: the URL ended in '.htmll' (extra 'l'), and
    # headers was passed positionally, which requests.get() interprets as
    # the `params` argument — it must be passed as headers=headers.
    url = ('http://search.51job.com/list/180200,000000,0000,00,9,99,'
           + key + ',2,' + str(i) + '.html')
    r = requests.get(url, headers=headers, timeout=10)
    r.encoding = 'gbk'  # legacy 51job pages are gbk-encoded
    # Extract each job posting's detail-page href from the listing HTML.
    reg = re.compile(
        r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">',
        re.S)
    return re.findall(reg, r.text)


# 多页处理,下载到文件
def get_content(link):
    """Fetch one job-detail page and parse it into a 12-element row matching
    the header written by run().

    Returns the parsed row, or an empty list if parsing failed (the traceback
    is printed). Fix vs. original: `item` is initialised before the try block,
    so an early xpath IndexError no longer turns into an UnboundLocalError at
    the `return` statement.
    """
    item = []
    # headers must be a keyword argument (positionally it would be `params`).
    r1 = requests.get(link, headers=headers, timeout=10)
    r1.encoding = 'gbk'  # legacy 51job pages are gbk-encoded
    t1 = html.fromstring(r1.text)
    try:
        job = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0].strip()
        company = t1.xpath('//p[@class="cname"]/a/text()')[0].strip()
        print('公司:', company)
        # The "msg ltype" paragraph holds, in order: area / required
        # experience / education / headcount. Evaluate the xpath once.
        msg = t1.xpath('//div[@class="tHeader tHjob"]//p[@class="msg ltype"]/text()')
        area = msg[0].strip()
        workyear = msg[1].strip()
        education = msg[2].strip()
        people = msg[3].strip()
        # This column actually stores the detail-page URL (header says "URL").
        date = link
        # Job description: regex-slice the raw HTML, then strip common tags
        # and whitespace. Note '<' is removed wholesale, so any tag fragments
        # left after the explicit replacements disappear too.
        describes = re.findall(
            re.compile('<div class="bmsg job_msg inbox">(.*?)div class="mt10"', re.S),
            r1.text)
        describe = (describes[0].strip()
                    .replace('<p>', '').replace('</p>', '')
                    .replace('<span>', '').replace('</span>', '')
                    .replace('\t', '').replace('<', '')
                    .replace('\n', '').replace('&nbsp;', ''))
        # The last "fp" paragraph holds the company address.
        company0 = t1.xpath('//p[@class="fp"]/text()')[-1]
        print('公司信地址', company0)
        # Company type and size (first two com_tag entries), space-joined.
        companytypes = t1.xpath('//div[@class="com_tag"]/p/text()')[:2]
        company1 = ''.join(' ' + part for part in companytypes)
        # Salary is taken from the raw HTML with a regex (the xpath variant in
        # the original was dead code, immediately overwritten).
        salary = re.findall(
            re.compile(r'div class="cn">.*?<strong>(.*?)</strong>', re.S),
            r1.text)[0]
        labels = t1.xpath('//div[@class="jtag"]/div[@class="t1"]/span/text()')
        label = ''.join(' ' + part for part in labels)
        # Column order must match row0 in run().
        item = [str(company), str(job), str(education), str(area),
                str(salary), str(label), str(workyear), str(people),
                str(date), str(describe), str(company0), str(company1)]
    except Exception:
        # Best-effort scraping: log the failure, return what we have (empty).
        traceback.print_exc()
    return item
def run():
    """Main flow: iterate over result pages, parse each job link, and write
    all rows to 51job.xls.

    Fixes vs. original: the 12 copy-pasted sheet1.write calls are a loop,
    a failure on one link no longer abandons the rest of that page's links,
    and rows that parsed incompletely are skipped instead of raising.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet1 = workbook.add_sheet('message', cell_overwrite_ok=True)
    row0 = ["公司", "岗位", "学历", "地区", "薪水", "待遇", "工作经验",
            "需求人数", "URL", "职位信息", "联系方式", "公司信息及人数"]
    for j, title in enumerate(row0):
        sheet1.write(0, j, title)
    i = 0  # number of data rows written so far (row 0 is the header)
    for a in range(1, 72):  # page count — adjust per the total shown on the site
        print('正在爬取第{}页信息'.format(a))
        try:
            links = get_links(a)
        except Exception:
            traceback.print_exc()
            continue
        for link in links:
            # Random sleep to reduce the chance of being rate-limited/banned.
            time.sleep(random.random() + random.randint(0, 1))
            print(i)
            print(link)
            try:
                item = get_content(link)
                if len(item) < len(row0):
                    continue  # parsing failed; skip instead of writing a bad row
                for col, value in enumerate(item):
                    sheet1.write(i + 1, col, value)
                i += 1
            except Exception:
                traceback.print_exc()
                continue
    workbook.save('51job.xls')


run()

在这里插入图片描述
从页面下角获取总页数,然后在代码里修改:
在这里插入图片描述

导入出现问题要用 install:
在这里插入图片描述
在这里插入图片描述

已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页