Scraping Python-related job listings from Lagou and storing them in MongoDB

The previous post stored the scraped data in a CSV file.
This post stores the data in MongoDB instead, which wraps up the Lagou crawl; depending on how things go, I may scrape other job sites later.
The code is as follows:

import requests
import json
import re
from bs4 import BeautifulSoup
import time
import bs4
from multiprocessing import Pool
import threading
from pymongo import MongoClient

head={"user-agent": "Mozilla/5.0",
      "referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
      }
count_page=0
cookies=None

con=MongoClient("localhost",27017)
db=con.lagou   # created automatically if it does not exist
clt=db.lagou   # created automatically if it does not exist
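# note: pymongo's MongoClient is not fork-safe; with multiprocessing.Pool each worker
# inherits this client through fork. A safer pattern is to create a client inside each
# worker process; the module-level client is kept here to match the original flow.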

def get_cookie():
    try:
        url="https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        sess=requests.session()
        sess.get(url,headers=head)  # hit the listing page first so the site issues valid session cookies
        cookies=sess.cookies.get_dict()
        if cookies:
            return cookies
    except Exception as ex:
        print(ex)

def getHtml(url,judge=None):    # judge truthy: reuse the cached cookies; falsy: fetch fresh cookies first
    global cookies
    try:
        if judge:
            html=requests.get(url, headers=head,cookies=cookies)
        else:
            cookies=get_cookie()
            html = requests.get(url, headers=head,cookies=cookies)
        if not re.search("您操作太频繁,请稍后再访问",html.text):
            html.raise_for_status()
            html.encoding = "utf-8"
            return html.text
        else:
            print("——————————————————访问频繁,令judge为None,重新获取cookies进行访问————————————————————")
            return getHtml(url,judge=None)
    except Exception as ex:
        print("出现异常",ex)


def parseFirstHtml(content):    # parse the listing response and extract each job's positionId
    FirstHtmlList=[]
    try:
        content=content.split('"positionResult":',1)[1] # slice out the JSON-parseable substring
        content=content.rsplit(',"pageSize"',1)[0]
        content=json.loads(content)["result"] # parse the JSON into a dict
        for i in content:   # collect every positionId on this page
            index=i["positionId"]
            FirstHtmlList.append(index)
        return FirstHtmlList    # a fresh list of (at most) 15 ids per listing page
    except Exception as ex:
        print(ex,content,"unexpected page content, parsing failed")

def parseSecondHtml(secondcontent,url):     # parse a job page for salary, location, experience, degree, and job description
    soup = BeautifulSoup(secondcontent, "html.parser")
    if isinstance(soup.find("h2", attrs={"class": "title"}),bs4.element.Tag):
        try:
            title = soup.find("h2", attrs={"class": "title"}).get_text()
            introduce = soup.find("div", attrs={"class": "items"}).get_text()  # holds salary, location, experience, and degree
            introduce = introduce.strip().split("\n")
            new_intro = []
            for i in introduce:
                if i != "":
                    i = i.strip()
                    new_intro.append(i)
            salary, loc, years, degree = new_intro[0], new_intro[1], new_intro[3], new_intro[4]  # pick out salary, location, experience, degree
            work_content = soup.find("div", attrs={"class": "content"}).get_text()

            outdict ={"salary":salary,"loc":loc,"years":years,"degree":degree,"content":work_content}   # store as a dict so it can go straight into MongoDB
            return outdict
        except Exception as ex:
            print(ex, soup.find("h2", attrs={"class": "title"}), soup)
    else:
        print("页面出现问题,正在加载导致内容无法提取",url)
        return None

def outputMongo(clt,outdict):    # insert one job document into MongoDB
    clt.insert_one(outdict)


def multi_main(url_2):  # thread worker: fetch, parse, and store one job page
    secondcontent = getHtml(url_2, judge=1)
    outdict = parseSecondHtml(secondcontent, url_2) if secondcontent else None
    if not outdict:
        # retry once with fresh cookies before giving up on this page
        secondcontent = getHtml(url_2, judge=None)
        outdict = parseSecondHtml(secondcontent, url_2) if secondcontent else None
    if outdict:
        outputMongo(clt,outdict)

'''
multithreading: one thread per job page within each listing page
'''
def firstmain(page):    # pool worker: fetch one listing page, parse out the job ids, and spawn a thread per id
    global count_page   # note: each worker process keeps its own copy of this counter
    FirstList=None
    Firsturl="https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&pn="+str(page)+"&kd=python"
    Firstcontent=getHtml(Firsturl)
    if Firstcontent:
        FirstList=parseFirstHtml(Firstcontent)
    else:
        print("failed to fetch %s"%Firsturl)
        return
    try:
        if not re.search("positionId",Firstcontent):
            print("—— warning: listing page returned no job data ——")
            return
        else:
            count_page+=1
            print("fetched page=%s, pages processed so far: %s"%(page,count_page))
    except Exception as ex:
        print(ex,Firstcontent,type(Firstcontent))
    if FirstList:
        # one thread per job page
        for i in FirstList:
            url_2 = "https://m.lagou.com/jobs/" + str(i) + ".html"
            t=threading.Thread(target=multi_main,args=(url_2,))
            t.start()

'''
multiprocessing entry point
'''
if __name__ == '__main__':
    time_1=time.time()
    print(time.asctime(time.localtime(time.time())),time_1)
    pp=Pool(4)
    for page in range(1,40):
        pp.apply_async(firstmain,(page,))
    pp.close()
    pp.join()
    time_2=time.time()
    if con:
        con.close()
    print(time.asctime(time.localtime(time.time())),"elapsed time: %s"%(time_2-time_1))
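
Once the crawl finishes, it is easy to sanity-check what actually landed in MongoDB. Below is a minimal sketch, assuming the same local MongoDB instance and the lagou database/collection names used above; the regex query on the degree field is only an illustration and assumes Lagou renders degree requirements as Chinese text such as 本科.

from pymongo import MongoClient

# connect to the same local MongoDB instance the crawler wrote to
con = MongoClient("localhost", 27017)
clt = con.lagou.lagou

print("documents stored:", clt.count_documents({}))
print("sample document:", clt.find_one())

# illustrative query: postings whose degree field mentions 本科 (bachelor's)
# -- assumes Lagou's pages render the requirement as Chinese text
print("bachelor-level postings:", clt.count_documents({"degree": {"$regex": "本科"}}))

con.close()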
