一、使用 BeautifulSoup 多进程抓取智联招聘信息,并存储到 MongoDB

# coding:utf-8
import re
import requests
import urllib2
import datetime
from bs4 import BeautifulSoup
from pymongo import MongoClient
from multiprocessing import Pool

# Open the MongoDB connection: database "job", collection "zhilian".
cn = MongoClient('localhost', 27017)
db = cn.job
table = db.zhilian
# Start each crawl from an empty collection.
# Only the process that runs this file as a script performs the wipe: with
# the "spawn" start method multiprocessing re-imports this module in every
# worker, and an unconditional wipe there would delete rows that other
# workers have already inserted.
# delete_many({}) replaces Collection.remove({}), which is deprecated in
# PyMongo 3 and no longer exists in PyMongo 4.
if __name__ == "__main__":
    table.delete_many({})

# Search parameters (test values), percent-encoded for use in the query string.
job = u'测试'
place = u'全国'
job_url = urllib2.quote(job.encode('utf-8'))
place_url = urllib2.quote(place.encode('utf-8'))

# Work out how many result pages the search produces.
def get_page():
    """Return the number of result pages for the configured search.

    Downloads page 1 of the search results, counts the listing tables on it,
    reads the total number of matching jobs from the page and prints the
    per-page count, the total and the resulting page count.

    Returns:
        int: ceil(total / per-page count), or 0 when the first page holds no
        listings (so the caller's ``range(1, pages + 1)`` is empty).

    Raises:
        ValueError: if the total job count cannot be found on the page.
    """
    # Same browser User-Agent that get_zhaopin sends, so both requests are
    # treated alike by the server.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&p=1&kt=3'.format(place_url, job_url)
    # A timeout keeps a stalled connection from hanging the script forever.
    wbdata = requests.get(url, headers=headers, timeout=10).content
    soup = BeautifulSoup(wbdata, 'lxml')
    items = soup.select("div#newlist_list_content_table > table")
    # One table is not counted as a listing (presumably the column-header
    # table -- confirm against the page markup).
    count = len(items) - 1
    # Number of listings per page
    print(u'每个页面的有%s条数据' % count)

    job_count_data = soup.find('span', class_="search_yx_tj")
    match = None
    if job_count_data is not None:
        match = re.search(r'\d+', job_count_data.get_text())
    if match is None:
        raise ValueError('total job count not found on the search result page')
    job_count = match.group()
    print(u'满足搜索条件职位有%s个' % job_count)

    # Number of result pages: ceiling division, so an exact multiple of the
    # per-page count does not produce an extra, empty page, and the result
    # stays an int regardless of the division semantics in use.
    if count <= 0:
        pages = 0
    else:
        pages = (int(job_count) + count - 1) // count
    print(u'一共有%d页面' % pages)
    return pages

# Worker: scrape one result page and store its listings.
def get_zhaopin(page_):
    """Scrape result page ``page_`` and insert its listings into ``table``.

    Each stored document has the keys ``name``, ``salary``, ``location`` and
    ``time``, taken from the text of the matching cells of the listing tables.

    Args:
        page_: page number placed in the ``p`` query parameter.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&p={}&kt=3'.format(place_url, job_url, page_)

    # Without a timeout a stalled connection would block this worker forever
    # and pool.join() in the driver would never return.
    wbdata = requests.get(url, headers=headers, timeout=10).content
    soup = BeautifulSoup(wbdata, 'lxml')

    job_name = soup.select("table.newlist > tr > td.zwmc > div > a")
    salarys = soup.select("table.newlist > tr > td.zwyx")
    locations = soup.select("table.newlist > tr > td.gzdd")
    times = soup.select("table.newlist > tr > td.gxsj > span")

    # The four lists are paired positionally; zip stops at the shortest one.
    records = [
        {
            'name': name.get_text(),
            'salary': salary.get_text(),
            'location': location.get_text(),
            'time': stamp.get_text(),
        }
        for name, salary, location, stamp in zip(job_name, salarys, locations, times)
    ]
    # One bulk insert per page instead of one round trip per listing.
    # insert_many rejects an empty list, hence the guard.
    if records:
        table.insert_many(records)

# Crawl every result page with a pool of 5 worker processes.
if __name__=="__main__":
    page = get_page()
    start_time = datetime.datetime.now()
    pool = Pool(processes=5)
    # Keep the AsyncResult: it is the only place worker exceptions surface.
    result = pool.map_async(get_zhaopin, range(1, page + 1))
    pool.close()
    pool.join()
    end_time = datetime.datetime.now()
    print(u'发费总时间为:%sS' % (end_time - start_time).seconds)
    # Re-raise the first exception raised in a worker, if any; discarding the
    # result would make a failed crawl look like a successful one.
    result.get()
  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值