Scraping job listings from 58.com (58同城)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Time: 2018/6/4
"""
Step 1: start from the job listing page:
        http://mas.58.com/job/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key=
Step 2: from that page, collect every posting's URL along with its salary, benefits, title, and requirements
Step 3: walk through the posting URLs to obtain each job's address
Step 4: write the collected information into a spreadsheet
"""
import requests
import re
from bs4 import BeautifulSoup
import threading
import xlwt
# the scraped information is stored in parallel lists, one entry per job
infs=[]        # full one-line summaries
infnames=[]    # job titles
infadds=[]     # addresses
infsalaries=[] # salaries (amount plus unit)
infcops=[]     # company names
infwels=[]     # benefits
infquals=[]    # required education
infexers=[]    # required experience
def getUrl(Urls,html):
    # unfinished placeholder for step 3 (collecting the individual posting URLs); never called
    soup=BeautifulSoup(html,'lxml')
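# A sketch of what the stub above could do for step 3 of the plan: collect each
# posting's detail URL. The anchor-tag lookup below is an assumption about the
# page structure, not verified against 58.com, so it is kept as a comment:
#
# def getUrl(Urls, html):
#     soup = BeautifulSoup(html, 'lxml')
#     for item in soup.find_all('li', class_='job_item clearfix'):
#         link = item.find('a')
#         if link and link.get('href'):
#             Urls.append(link['href'])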

def writeExcel():
    f=xlwt.Workbook()
    sheet1=f.add_sheet(u'sheet',cell_overwrite_ok=True)
    for i in range(len(infnames)):
        sheet1.write(i,0,infnames[i])  # write each record to the sheet, one row per job
        sheet1.write(i,1,infadds[i])
        sheet1.write(i,2,infsalaries[i])
        sheet1.write(i,3,infcops[i])
        sheet1.write(i,4,infwels[i])
        sheet1.write(i,5,infquals[i])
        sheet1.write(i,6,infexers[i])
    f.save('58.xls')
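# Side note: xlwt produces the legacy .xls format, which caps a sheet at 65,536
# rows. A minimal sketch of an .xlsx alternative using pandas is kept below as
# a comment; it assumes pandas and openpyxl are installed and is not part of
# the original script:
#
# import pandas as pd
# def writeExcelPandas():
#     df = pd.DataFrame({'name': infnames, 'address': infadds,
#                        'salary': infsalaries, 'company': infcops,
#                        'welfare': infwels, 'education': infquals,
#                        'experience': infexers})
#     df.to_excel('58.xlsx', index=False)  # needs the openpyxl engine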
def getInf(soup):
    bsInfos=soup.find_all('li',class_='job_item clearfix')  # one <li> per job listing
    for bsinfo in bsInfos:
        address=bsinfo.find('span',class_="address")  # job address
        address=address.string.strip()
        name=bsinfo.find('span',class_="name")  # job title
        name=re.findall(r'[\u4e00-\u9fa5]+.[\u4e00-\u9fa5]+',str(name))  # pull the Chinese title out of the tag markup
        name=''.join(name)
        salary=bsinfo.find('p',class_="job_salary")  # salary
        salary=re.search(r'(\d*-\d*)|([\u4e00-\u9fa5]+)',str(salary))  # a range like 3000-5000, or text such as 面议 ("negotiable")
        salary=salary.group()
        try:
            unit=bsinfo.find('i',class_="unit")  # salary unit
            unit=unit.string.strip()
        except AttributeError:
            unit='元/月'  # default unit: yuan per month
        try:
            com=bsinfo.find('div',class_="comp_name")  # company name
            company=re.search(r'[\u4e00-\u9fa5]+',str(com))
            comp=company.group()
        except AttributeError:
            comp='无'  # "none": company name missing
        try:
            wel=bsinfo.find('div',class_="job_wel clearfix")  # benefit tags
            welfares=wel.contents
            wels=''
            for welfare in welfares:
                wels=wels+welfare.string.strip()+' '
        except AttributeError:
            wels='无'  # "none": no benefits listed
        xueli=bsinfo.find('span',class_="xueli")  # required education
        xueli=re.search(r'[\u4e00-\u9fa5]+',str(xueli))
        xueli=xueli.group()

        exer=bsinfo.find('span',class_="jingyan")  # required experience
        exer=re.search(r'[\u4e00-\u9fa5]+|\d?-\d?年',str(exer))  # text such as 不限, or a range like 1-3年
        exer=exer.group()
        final='职位名称:'+name+'职位'+'  '+"公司:"+comp+'  '+"薪水:"+salary+unit+'  '+"地址:"+address+'   '+'福利:'+wels+'  '+'学历:'+xueli+'  '+'经验:'+exer
        infs.append(final)  # one human-readable summary per job, plus one list per field
        infnames.append(name)
        infadds.append(address)
        infsalaries.append(salary+unit)
        infcops.append(comp)
        infwels.append(wels)
        infquals.append(xueli)
        infexers.append(exer)
def main():
    ths=[]  # thread handles, used only by the threaded variant kept commented out below
    for i in range(71):  # the listing spans roughly 70 pages
        if(i==0):
            url='http://mas.58.com/job/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        elif(i==1):
            continue  # /pn1/ would duplicate the bare listing URL above
        else:
            url='http://mas.58.com/job/pn'+str(i)+'/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        html=requests.get(url)
        html=html.text
        soup=BeautifulSoup(html,'lxml')
        getInf(soup)
    writeExcel()
    """Threaded variant: parse each page in its own thread, then wait for all of them.
    for i in range(71):
        if(i==0):
            url='http://mas.58.com/job/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        elif(i==1):
            continue
        else:
            url='http://mas.58.com/job/pn'+str(i)+'/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        html=requests.get(url)
        html=html.text
        soup=BeautifulSoup(html,'lxml')
        th=threading.Thread(target=getInf,args=[soup])
        th.start()
        ths.append(th)
    for th in ths:  # join() must sit outside the page loop, after every thread has started
        th.join()
    print("Collected %s records in total"%(len(infs)))"""

main()
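The loop above fires some 70 requests back to back with requests' default headers, which a large site like 58.com may throttle or block. Below is a minimal sketch of a politer fetch helper; the header string, timeout, and delay are illustrative choices, not part of the original script:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # a browser-like header; the value is illustrative

def fetch(url, delay=1.0):
    # Fetch one listing page, fail loudly on HTTP errors, and pause between requests.
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # surface 4xx/5xx instead of silently parsing an error page
    time.sleep(delay)        # space the page requests apart to stay polite
    return resp.text

Inside main(), the pair html=requests.get(url) / html=html.text would then become html=fetch(url).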
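The threaded variant kept in the comment has a subtler problem than the join() placement: getInf() appends to eight shared global lists from several threads at once. Each individual list.append is atomic in CPython, but the eight appends that make up one record can interleave between threads, so row i of infnames may stop matching row i of infadds. Below is a sketch that serializes each record append behind a lock; append_record and record_lock are names introduced here, not part of the original:

import threading

record_lock = threading.Lock()  # guards the eight shared result lists

def append_record(final, name, address, salary, comp, wels, xueli, exer):
    # Hold the lock across all eight appends so one job's fields always land
    # on the same row index in every list.
    with record_lock:
        infs.append(final)
        infnames.append(name)
        infadds.append(address)
        infsalaries.append(salary)
        infcops.append(comp)
        infwels.append(wels)
        infquals.append(xueli)
        infexers.append(exer)

getInf() would call append_record(...) once per job in place of its eight separate appends.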

    
    

