#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape job postings from hr.tencent.com and upsert them into MongoDB.

The script walks every listing page, follows each posting to its detail
page to collect the responsibility/requirement bullet points, and hands
the assembled records to ``mongo_info.update_tencent``.
"""
from math import ceil

import requests
from lxml import etree

from mongodb_config import mongo_info

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
                  'Safari/537.36'}

# Number of postings shown on one listing page (also the `start` step).
_PAGE_SIZE = 10
# Seconds to wait for the server; without a timeout a stalled connection
# would block the crawl forever.
_REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download *url* with the shared headers and return the parsed lxml tree."""
    res = requests.get(url, headers=header, timeout=_REQUEST_TIMEOUT)
    return etree.HTML(res.text)


def _first(matches, default=''):
    """Return the first XPath match, or *default* when nothing matched."""
    return matches[0] if matches else default


# Get the number of listing pages
def getJobPage(url):
    """Return how many listing pages exist.

    The total posting count is read from the first ``<span>`` inside
    ``div.left`` and divided by the page size, rounding up.

    Raises:
        IndexError: if the page does not contain the total-count element.
        ValueError: if the element's text is not an integer.
    """
    html = _fetch_html(url)
    total = html.xpath('//div[@class="left"]/span/text()')[0]
    job_page = ceil(int(total) / _PAGE_SIZE)
    return job_page


# Get the job description from the detail page
def getJobOrder(url):
    """Return ``[job_request, job_order]`` scraped from a detail page.

    ``job_request`` holds the responsibility bullet texts (table row 3) and
    ``job_order`` the requirement bullet texts (table row 4). When the page
    has no ``table.tablelist.textl`` element, two empty lists are returned
    so callers can always index the result. If several tables match, the
    last one wins.
    """
    html = _fetch_html(url)
    detail_info = html.xpath('//table[@class="tablelist textl"]')
    list_data = [[], []]
    for item in detail_info:
        # Job responsibilities
        job_request = item.xpath('tr[3]//ul[@class="squareli"]/li/text()')
        # Job requirements
        job_order = item.xpath('tr[4]/td/ul[@class="squareli"]/li/text()')
        list_data = [job_request, job_order]
    return list_data


# Get the job info from a listing page
def getJobInfo(url):
    """Scrape one listing page, enrich each row from its detail page, and store it.

    Every ``tr.even`` / ``tr.odd`` row becomes a dict with the keys
    ``detail_url``, ``job_name``, ``job_type``, ``job_people``,
    ``job_addre``, ``job_time``, ``job_request`` and ``job_order``. The
    collected dicts are passed to ``mongo_info.update_tencent`` and printed.
    """
    html = _fetch_html(url)
    job_list = html.xpath('//tr[@class="even" or @class="odd"]')
    list_data = []
    href = 'https://hr.tencent.com/'
    for list_job in job_list:
        link = _first(list_job.xpath('.//a/@href'), None)
        if link is None:
            # Without a link there is no detail page (and no job title) to
            # record, so the row is skipped instead of aborting the page.
            continue
        detail_url = href + link
        # Fetch the detail page once and reuse both parts of the result.
        job_request, job_order = getJobOrder(detail_url)
        dict_item = {
            'detail_url': detail_url,
            'job_name': _first(list_job.xpath('.//td[1]/a/text()')),
            # Cells may be empty on the site; fall back to '' rather than
            # raising IndexError.
            'job_type': _first(list_job.xpath('.//td[2]/text()')),
            'job_people': _first(list_job.xpath('.//td[3]/text()')),
            'job_addre': _first(list_job.xpath('.//td[4]/text()')),
            'job_time': _first(list_job.xpath('.//td[5]/text()')),
            'job_request': job_request,
            'job_order': job_order,
        }
        list_data.append(dict_item)
    if list_data:
        # Skip the database round trip when the page yielded nothing.
        mongo_info.update_tencent(list_data)
    print(list_data)


def main():
    """Crawl every listing page, starting from the first one."""
    mainurl = 'https://hr.tencent.com/position.php?&start=0#a'
    jobPage = getJobPage(mainurl)
    for page in range(jobPage):
        pageUrl = 'https://hr.tencent.com/position.php?&start=' + str(page * _PAGE_SIZE) + '#a'
        print("第" + str(page + 1) + "页")
        getJobInfo(pageUrl)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""MongoDB access helpers shared by the scrapers.

Importing this module creates ``mongo_info``, a ready-to-use
``Connect_mongo`` instance connected to the local MongoDB server.
"""
from pymongo import MongoClient


class Connect_mongo(object):
    """Thin wrapper around the ``loan`` and ``tencent`` MongoDB databases."""

    def __init__(self):
        """Connect to MongoDB on 127.0.0.1:27017 and authenticate both databases."""
        # Other hosts that were used with this code: 192.168.1.191 and
        # 192.168.1.193 (same port).
        self.client = MongoClient(host='127.0.0.1', port=27017)
        # Database named "loan"
        self.database = self.client.loan
        self.tencent_database = self.client.tencent
        # Authenticate with user name / password.
        # NOTE(review): Database.authenticate() was deprecated in PyMongo 3.x
        # and removed in 4.0; on a newer driver the credentials have to be
        # passed to MongoClient instead. Kept here so `dbinfo` /
        # `tencent_dbinfo` keep their current values.
        self.dbinfo = self.database.authenticate('xxx', 'xxxx')
        self.tencent_dbinfo = self.tencent_database.authenticate('xxx', 'xxx')

    @staticmethod
    def _upsert_each(collection, key_field, document):
        """Upsert every record of *document* into *collection*, matched on *key_field*.

        Args:
            collection: pymongo collection to write to.
            key_field: name of the field that identifies a record.
            document: iterable of dicts; each must contain *key_field*.

        Returns:
            The result of the last ``update_many`` call, or ``None`` when
            *document* is empty.

        Raises:
            KeyError: if a record lacks *key_field*.
        """
        result = None
        for res in document:
            result = collection.update_many(
                {key_field: res[key_field]}, {'$set': res}, upsert=True)
        return result

    # Query all product names
    def find_all_name(self):
        """Return a cursor over ``loan_datase_sums`` yielding only ``loan_name``."""
        _collection = self.client.loan.loan_datase_sums
        return _collection.find({}, {'_id': 0, 'loan_name': 1})

    # Query de-duplicated product names
    def find_distinct_name(self):
        """Return the distinct ``loan_name`` values stored in ``loan_datase_sums``."""
        _collection = self.client.loan.loan_datase_sums
        return _collection.distinct('loan_name')

    def update_data(self, document):
        """Upsert *document* records into ``loan.loan_datase_sums`` keyed by ``loan_name``.

        Returns the last update result, or ``None`` for an empty *document*.
        """
        return self._upsert_each(
            self.database.loan_datase_sums, 'loan_name', document)

    def update_rong360(self, document):
        """Upsert *document* records into ``loan.rong360`` keyed by ``loan_name``.

        Returns the last update result, or ``None`` for an empty *document*.
        """
        return self._upsert_each(self.database.rong360, 'loan_name', document)

    def update_tencent(self, document):
        """Upsert *document* records into ``tencent.advertise_datase`` keyed by ``job_name``.

        Returns the last update result, or ``None`` for an empty *document*.
        """
        return self._upsert_each(
            self.tencent_database.advertise_datase, 'job_name', document)


mongo_info = Connect_mongo()