Python crawler -- Lianjia scraping project [intermediate version]

This article shows how to apply a proxy pool in a Python crawler project to scrape data from the Lianjia website; see the author's earlier article on setting up the proxy pool.

The Lianjia crawler relies on a proxy pool; for how to build one, see my earlier article. All the crawler assumes is a local HTTP endpoint, http://localhost:5000/get, that returns one proxy address per request, as sketched below.
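As a stand-in for the real pool, a minimal sketch of such an endpoint might look like the following. The Flask app and the placeholder addresses are assumptions for illustration only, not the pool from the earlier article; the crawler only depends on the /get route returning an "ip:port" string.

import random
from flask import Flask

app = Flask(__name__)

# placeholder addresses; a real pool keeps these fresh and validated
PROXIES = ['117.69.12.34:8888', '182.34.56.78:9999']

@app.route('/get')
def get_proxy():
    # return a single "ip:port" string; the crawler prefixes it with "http://"
    return random.choice(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)

With something like that running on port 5000, the crawler below can pull a fresh proxy whenever a request fails.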

import hashlib

import requests
from lxml import etree
import pymongo
class Lianjia(object):
    def __init__(self,url):
        self.url = url
        self.proxies = self.get_proxies()
        # connect to the local MongoDB instance
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # select the database that will hold the listings
        self.db = self.client['lianjia']
        self.main()
    def get_proxies(self):
        try:
            response = requests.get('http://localhost:5000/get')
            proxies = {
                # Lianjia pages are served over HTTPS, so map both schemes
                # to the address returned by the pool
                'http': 'http://' + response.text,
                'https': 'http://' + response.text,
            }
            return proxies
        except Exception:
            return None
    # request the URL and return the parsed page as an lxml HTML tree
    def get_xpath_by_requests(self,url,proxies):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Referer': 'https://bj.lianjia.com/?utm_source=baidu&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x',
        }
        try:
            # time out quickly so a dead proxy triggers the retry below
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            # print(response.text)
            return etree.HTML(response.text)
        except Exception:
            # the request failed; fetch a fresh proxy and retry
            proxies_new = self.get_proxies()
            print('retrying with a new proxy; the failed one was', proxies)
            return self.get_xpath_by_requests(url,proxies_new)


    # xpath() returns a list; take the first match or fall back to an empty string
    def get_text(self, text):
        if text:
            return text[0]
        return ''

    # hash the detail-page URL so each listing gets a stable unique key
    def get_md5(self, value):
        md5 = hashlib.md5(bytes(value, encoding='utf-8'))
        return md5.hexdigest()

    def write_to_mongo(self, item):
        # upsert keyed on the URL hash, so re-running the crawler updates rather than duplicates
        item['hash_url'] = self.get_md5(item['detail_url'])
        self.db['beijing'].update_one({'hash_url': item['hash_url']}, {'$set': item}, upsert=True)


    def parse_page(self,div_list):
        for div in div_list:
            title =self.get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/text()')).strip()
            # print(title)
            price = self.get_text(div.xpath('.//span[@class="content__list--item-price"]/em/text()'))
            # print(price)
            # address, size, orientation and layout could be extracted here in the same way
            # link to the listing's detail page
            detail_url = self.get_text(div.xpath('.//p[@class="content__list--item--title twoline"]/a/@href'))
            item = {}
            item['title'] = title
            item['price'] = price
            item['detail_url'] = detail_url
            # print(item)
            self.write_to_mongo(item)
    def parse_area(self,url):
        # this first fetch is only needed by the commented-out max-page approach below
        html = self.get_xpath_by_requests(url, self.proxies)
        # pagination approach 1: request pg1, pg2, ... until a page comes back with no listings
        i=1
        while True:
            page_url = url+'pg{}'.format(i)
            html = self.get_xpath_by_requests(page_url, self.proxies)
            div_list = html.xpath('//div[@class="content__list"]/div')
            if not div_list:
                break
            self.parse_page(div_list)
            i+=1
        # pagination approach 2: read the last page number from the page and loop up to it
        # max_page = self.get_text(html.xpath('//a[@class="next"]/text()'))
        # print(max_page)
        # for i in range(1, int(max_page) + 1):
        #     pass
        # pagination approach 3: follow the "next page" link, mimicking a click on it:
        # parse the current page first,
        # then grab the next-page link;
        # if it is not empty, join it onto the base URL and call this method again (see the sketch below).
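        # A rough, untested sketch of approach 3. The '//a[@class="next"]/@href' selector
        # and the parse_area_by_next name are illustrative assumptions, not taken from
        # the original project:
        # def parse_area_by_next(self, url):
        #     html = self.get_xpath_by_requests(url, self.proxies)
        #     self.parse_page(html.xpath('//div[@class="content__list"]/div'))
        #     next_href = self.get_text(html.xpath('//a[@class="next"]/@href'))
        #     if next_href:
        #         self.parse_area_by_next('https://bj.lianjia.com' + next_href)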
    def main(self):
        html = self.get_xpath_by_requests(self.url, self.proxies)
        # collect the relative links of the district filters (the first entry is skipped)
        areas = html.xpath('//div[@id="filter"]/ul[2]/li[position()>1]/a/@href')
        # print(areas)
        for area in areas:
            area_url = 'https://bj.lianjia.com'+area
            self.parse_area(area_url)
if __name__ == '__main__':
    base_url = 'https://bj.lianjia.com/zufang/'

    Lianjia(base_url)
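After a run, the stored listings can be checked straight from MongoDB. The snippet below is just a quick verification sketch; the database name lianjia and the collection name beijing come from the crawler above.

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['lianjia']['beijing']

# how many listings ended up in the collection, plus one sample document
print(collection.count_documents({}))
print(collection.find_one())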
