前一篇是把数据存入csv。
本篇将把数据存入mongodb数据库,并结束对拉勾网内容的爬取,后面看情况再爬取其他招聘网站的信息。
代码如下:
import requests
import json
import re
from bs4 import BeautifulSoup
import time
import bs4
from multiprocessing import Pool
import threading
from pymongo import MongoClient
# Shared request headers: Lagou rejects requests without a browser UA and a
# matching referer from the job-list page.
head={"user-agent": "Mozilla/5.0",
"referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
}
# Per-process counter of listing pages handled successfully (each Pool worker
# re-imports this module and gets its own copy).
count_page=0
# Module-level cookie cache; refreshed by get_cookie() inside getHtml().
cookies=None
con=MongoClient("localhost",27017)
db=con.lagou # database is created lazily on first write if missing
clt=db.lagou # collection is created lazily on first write if missing
def get_cookie():
    """Hit the Lagou job-list page once and harvest its session cookies.

    Returns:
        dict | None: the cookies as a plain dict, or None when the request
        fails or yields no cookies (callers treat None as "no cookies").
    """
    url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    try:
        # Context manager releases the session's connection pool (the
        # original leaked the Session object).
        with requests.session() as sess:
            sess.get(url, headers=head)
            cookie_dict = sess.cookies.get_dict()
        if cookie_dict:
            return cookie_dict
    except Exception as ex:
        # Best-effort: log and fall through to None, as before.
        print(ex)
    return None
def getHtml(url,judge=None,_retries=3):
    """Download *url* and return its decoded text, or None on failure.

    Args:
        url: page to fetch.
        judge: when truthy, reuse the cached module-level ``cookies``;
            otherwise refresh them via get_cookie() first.
        _retries: internal guard bounding the anti-crawler retry so it can
            no longer recurse forever (backward-compatible addition —
            existing callers never pass it).
    """
    global cookies
    try:
        if not judge:
            cookies = get_cookie()
        html = requests.get(url, headers=head, cookies=cookies)
        # Force UTF-8 *before* the first .text access so the ban-page check
        # below sees correctly decoded Chinese (the original set encoding
        # only after .text had already been read).
        html.encoding = "utf-8"
        if re.search("您操作太频繁,请稍后再访问", html.text):
            print("——————————————————访问频繁,令judge为None,重新获取cookies进行访问————————————————————")
            if _retries <= 0:
                return None  # give up instead of unbounded recursion
            return getHtml(url, judge=None, _retries=_retries - 1)
        html.raise_for_status()
        return html.text
    except Exception as ex:
        print("出现异常", ex)
def parseFirstHtml(content):
    """Extract every job's positionId from the listing Ajax payload.

    The slice of *content* between '"positionResult":' and ',"pageSize"' is a
    valid JSON object whose "result" list holds one dict per posting.

    Args:
        content: raw text of the positionAjax.json response.
    Returns:
        list | None: position ids for one page (at most 15 entries), or
        None when the payload does not have the expected shape.
    """
    try:
        payload = content.split('"positionResult":', 1)[1]
        payload = payload.rsplit(',"pageSize"', 1)[0]
        jobs = json.loads(payload)["result"]
        return [job["positionId"] for job in jobs]
    except Exception as ex:
        # Keep the original best-effort diagnostics, then signal failure
        # explicitly (the original returned None implicitly).
        print(ex, content, "返回的页面内容有问题,解析失败")
        return None
def parseSecondHtml(secondcontent,url): #解析子页面,获取薪水、地点、年限、学历和工作内容的信息
    """Parse a mobile job-detail page into a Mongo-ready dict.

    Args:
        secondcontent: HTML text of https://m.lagou.com/jobs/<id>.html.
        url: the page URL, used only for diagnostics.
    Returns:
        dict | None: {salary, loc, years, degree, contert} on success, or
        None when the page is an anti-crawler placeholder / malformed
        (callers then retry with fresh cookies).
    """
    soup = BeautifulSoup(secondcontent, "html.parser")
    title_tag = soup.find("h2", attrs={"class": "title"})  # hoisted: found once, not three times
    if not isinstance(title_tag, bs4.element.Tag):
        print("页面出现问题,正在加载导致内容无法提取", url)
        return None
    try:
        introduce = soup.find("div", attrs={"class": "items"}).get_text()
        # Newline-separated fields: salary / city / (skipped) / years / degree.
        new_intro = [part.strip() for part in introduce.strip().split("\n") if part.strip()]
        if len(new_intro) < 5:
            # Guard the fixed indexing below instead of raising IndexError.
            print("简介字段不足", url, new_intro)
            return None
        salary, loc, years, degree = new_intro[0], new_intro[1], new_intro[3], new_intro[4]
        work_content = soup.find("div", attrs={"class": "content"}).get_text()
        # NOTE(review): "contert" is a long-standing typo for "content"; kept
        # byte-identical so existing Mongo documents and queries stay consistent.
        return {"salary": salary, "loc": loc, "years": years,
                "degree": degree, "contert": work_content}
    except Exception as ex:
        print(ex, title_tag, soup)
        return None
def outputMongo(clt,outdict):
    """Insert one parsed job-posting dict into the given Mongo collection."""
    clt.insert_one(outdict)
def multi_main(url_2):
    """Thread worker: fetch one job-detail page, parse it, store it in Mongo.

    First tries with the cached cookies (judge=1); if parsing fails (the page
    was an anti-crawler placeholder), refetches with fresh cookies and retries
    once. Nothing is inserted when both attempts fail.
    """
    secondcontent = getHtml(url_2, judge=1)
    outdict = parseSecondHtml(secondcontent, url_2)
    if outdict is None:
        # Retry once with freshly acquired cookies.
        secondcontent = getHtml(url_2, judge=None)
        # BUG FIX: the original parsed into `outList` here but then inserted
        # the stale None `outdict`, writing nothing useful on the retry path.
        outdict = parseSecondHtml(secondcontent, url_2)
    if outdict:
        outputMongo(clt, outdict)
'''
多线程
'''
def firstmain(page): #多进程,执行主页面的获取、解析和输出子页面的id号
    """Process worker: fetch listing page *page* and thread out its details.

    Note: count_page is per-process (each Pool worker imports its own copy),
    so the printed running total only counts pages handled by that worker.
    """
    global count_page
    Firsturl = ("https://www.lagou.com/jobs/positionAjax.json"
                "?needAddtionalResult=false&pn=" + str(page) + "&kd=python")
    Fristcontent = getHtml(Firsturl)
    if not Fristcontent:
        # BUG FIX: the original fell through after this print and hit a
        # TypeError in re.search(None) and a NameError on FirstList.
        print("%s页面提取失败" % Firsturl)
        return
    if not re.search("positionId", Fristcontent):
        print("————————————————————出现警告,获取内容失败——————————————————————")
        return
    count_page += 1
    print("成功打印page=%s页面,打印页数为%s" % (page, count_page))
    # Parse only after the payload passed the sanity check above.
    FirstList = parseFirstHtml(Fristcontent)
    if FirstList:
        # One thread per detail page: the work is I/O-bound, so the
        # network waits overlap.
        for pos_id in FirstList:
            url_2 = "https://m.lagou.com/jobs/" + str(pos_id) + ".html"
            threading.Thread(target=multi_main, args=(url_2,)).start()
'''
多进程
'''
if __name__ == '__main__':
    # Entry point: fan listing pages 1..39 out over 4 worker processes.
    time_1=time.time()
    print(time.asctime(time.localtime(time.time())),time_1)
    # Each worker process re-imports this module and therefore opens its
    # own MongoClient at import time.
    pp=Pool(4)
    for page in range(1,40):
        # NOTE(review): apply_async discards worker exceptions unless the
        # returned AsyncResult is .get()-ed — failures here are silent.
        pp.apply_async(firstmain,(page,))
    pp.close()
    pp.join()
    time_2=time.time()
    # NOTE(review): truth-testing pymongo objects raises NotImplementedError
    # in some versions — confirm; `con is not None` would be safer.
    if con:
        con.close()
    print(time.asctime(time.localtime(time.time())),"所花时间为%s"%(time_2-time_1))