Code snippet 1:
Every crawled record is stored in MongoDB together with a hash generated from selected field values. To avoid inserting duplicates when the same data is crawled again, each record is validated before insertion: we check whether a record with the same hash already exists in the database, and only insert it if it does not.
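Before the full spider, here is the dedup pattern in isolation: build a deterministic fingerprint from the fields that identify a record, and query the collection for that fingerprint before inserting. A minimal sketch of the idea (the helper name, collection, and field names are illustrative, not taken from the code below):

# -*- coding: utf-8 -*-
import hashlib
import pymongo

def insert_if_new(collection, record, key_fields):
    # concatenate the identifying fields into one deterministic string
    raw = ''.join(str(record[k]) for k in key_fields)
    record_hash = hashlib.md5(raw).hexdigest()
    # insert only when no record with the same hash is stored yet
    if collection.find_one({'hash': record_hash}) is None:
        record['hash'] = record_hash
        collection.insert_one(record)
        return True
    return False
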
#coding=utf-8
import requests,pymongo,math,json
import sys,re,ConfigParser,random
import numpy as np
import hashlib
import time
reload(sys)
sys.setdefaultencoding('utf-8')
config = ConfigParser.ConfigParser()
config.read('config.conf')
class lagouspiders:
    def __init__(self):
        self.headers = {  # request headers
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:54.0) Gecko/20100101 Firefox/54.0',
            'Host': 'www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%8C%97%E4%BA%AC',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.data = {  # request parameters
            'first': 'true',
            'kd': config.get('lagoumsg', 'kd'),      # search condition: position keyword
            # 'pn': config.get('lagoumsg', 'pn'),    # page number
            'city': config.get('lagoumsg', 'city')   # search condition: city
        }
        self.proxy_list = [  # proxy pool
            {'http': '202.117.120.242:8080'},
            {'http': '113.200.214.164:9999'},
            {'http': '27.46.5.97:9797'},
            {'http': '113.200.214.164:9999'},
            {'http': '42.157.5.154:9999'},
            {'http': '113.118.96.46:9797'},
            {'http': '210.26.125.142:8080'},
        ]
        self.proxy = random.choice(self.proxy_list)  # pick one proxy at random
    def test_crawler(self):
        # first request: fetch result metadata from Lagou (total count, city, position name)
        result1 = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0',
            headers=self.headers, data=self.data)
        result_json1 = result1.json()  # parse the response as JSON
        totalCount = result_json1['content']['positionResult']['totalCount']  # number of matching records
        city = result_json1['content']['positionResult']['locationInfo']['city']  # queried city
        querypositionName = result_json1['content']['positionResult']['queryAnalysisInfo']['positionName']  # queried position name
        pageSize1 = result_json1['content']['pageSize']
        page = int(math.ceil(float(totalCount) / pageSize1))  # number of result pages
        distinctcount = 0
        listmin = []
        listmax = []
        # use today's date plus the queried position name as the database name
        date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        databasename = str(date) + str(querypositionName)
        client = pymongo.MongoClient('192.168.20.155', 5555)  # create the client once, not per record
        rent_info = client[databasename]        # database
        sheet_table = rent_info['sheet_table']  # collection
        for j in range(1, page + 1):
            result = requests.post(
                'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0&pn=' + str(j),
                headers=self.headers, data=self.data, proxies=self.proxy)
            result_json = result.json()               # parse the response as JSON
            result_dict = json.loads(result.content)  # the same payload as a plain dict
            resultinsret = result_dict['content']['positionResult']['result']  # records to store in Mongo
            resultSize = result_json['content']['positionResult']['resultSize']
            for i in range(0, resultSize):  # write each record on this page to Mongo
                salary = result_json['content']['positionResult']['result'][i]['salary']  # salary range, e.g. '10k-20k'
                salary_num = re.findall(r"\d+", str(salary))  # extract the numeric bounds
                salary_min = salary_num[0]  # lower bound
                salary_max = salary_num[1] if len(salary_num) > 1 else salary_num[0]  # upper bound; some salaries list only one number
                # build the string the hash is generated from
                companyFullName = str(result_json['content']['positionResult']['result'][i]['companyFullName'])
                positionName = str(result_json['content']['positionResult']['result'][i]['positionName'])
                salary_max = str(salary_max)
                salary_min = str(salary_min)
                resultand = companyFullName + positionName + salary_max + salary_min
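                # sketch of the remaining steps, following the dedup description at the
                # top of this snippet (the 'hash' field name is an assumption, not
                # taken from the source):
                hashvalue = hashlib.md5(resultand).hexdigest()  # fingerprint of the record
                if sheet_table.find_one({'hash': hashvalue}) is None:  # check before insert
                    resultinsret[i]['hash'] = hashvalue
                    sheet_table.insert_one(resultinsret[i])

The script reads its search parameters from config.conf via ConfigParser. A minimal config.conf that would satisfy the config.get('lagoumsg', ...) calls above could look like this (the values are illustrative, matching the python/Beijing query in the Referer header):

[lagoumsg]
kd = python
city = 北京

To run the spider, instantiate the class and call the method:

if __name__ == '__main__':
    spider = lagouspiders()
    spider.test_crawler()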