Scraping Tencent job postings into a MongoDB database

SpiderTencent.py

import requests
from lxml import etree
import time
import pymongo
import random


class SpiderTencent(object):
    def __init__(self):
        """
        Initialize the base URL,
        the page-offset index,
        and the crawler on/off switch.
        """
        self.url = "http://hr.tencent.com/position.php?&start="
        self.index = 0
        self.switch = True
        self.tencent_data = []  # list that buffers the scraped Tencent job postings

    def con_mongodb(self):
        """
        Create a MongoDB client, connect to the local server,
        and insert every buffered posting into the py3.tencent collection.
        """
        client = pymongo.MongoClient(host="localhost", port=27017)
        db = client.py3
        collection = db.tencent
        for data in self.tencent_data:
            collection.insert_one(data)  # insert() is deprecated in PyMongo 3.x; use insert_one()
        print("All data has been stored in MongoDB!")

    def get_html(self, url):
        """
        Fetch the HTML page with a randomly chosen User-Agent
        and parse it into an lxml element tree.
        """
        headers_list = [
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"},
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"},
        ]
        headers = random.choice(headers_list)  # rotate User-Agents to look less like a bot
        try:
            response = requests.get(url=url, headers=headers, timeout=20)
        except requests.RequestException as e:
            print("Request failed ({}), retrying once".format(e))
            response = requests.get(url=url, headers=headers, timeout=20)
        html = response.text
        content = etree.HTML(html)
        return content

    def load_page(self, url):
        """
        Extract the postings from the page with XPath
        and buffer them for the MongoDB insert.
        """
        content = self.get_html(url)
        job_title = content.xpath('(//tr[@class="even"] | //tr[@class="odd"])//a/text()')    # position title
        job_category = content.xpath('//tr[@class="even"]//td[2]//text() | //tr[@class="odd"]//td[2]//text()')   # position category
        number = content.xpath('//tr[@class="even"]//td[3]//text() | //tr[@class="odd"]//td[3]//text()')  # headcount
        location = content.xpath('//tr[@class="even"]//td[4]//text() | //tr[@class="odd"]//td[4]//text()')  # work location
        info_list = zip(job_title, job_category, number, location)  # combine the four columns row by row
        for info in info_list:
            info = {"job_title": info[0], "job_category": info[1], "number": info[2], "location": info[3]}  # build one document per posting
            self.tencent_data.append(info)
        print("{} postings collected so far".format(len(self.tencent_data)))
        print("Fetching data" + "-" * 10)

    def start_switch(self):
        """
        Run the crawl loop until the switch is turned off.
        """
        while self.switch:
            tencent_url = self.url + str(self.index)  # build the URL for the current page offset
            self.load_page(tencent_url)
            time.sleep(5)  # pause between requests to avoid hammering the server
            if self.index < 2500:   # check whether the last page has been reached
                self.index += 10
            else:
                self.switch = False
                self.con_mongodb()  # flush the buffered data to MongoDB
                print("Program finished")


if __name__ == '__main__':
    tencent = SpiderTencent()
    tencent.start_switch()
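
Once the script finishes, a quick way to sanity-check the import is to read the data back out. The snippet below is a minimal sketch, assuming the same local MongoDB instance and the py3.tencent collection used above; the file name check.py is just for illustration.

# check.py -- minimal verification sketch; assumes the local MongoDB
# instance and the py3.tencent collection written by SpiderTencent.py
import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
collection = client.py3.tencent

print("documents stored:", collection.count_documents({}))  # total postings inserted
for doc in collection.find().limit(5):  # peek at the first few documents
    print(doc["job_title"], doc["job_category"], doc["number"], doc["location"])

Note that every run of the spider appends into the same collection, so re-running it will create duplicate documents; clearing the collection first with collection.delete_many({}) is one simple way to keep runs repeatable.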
