Python 爬虫——爬取链家新房

import re
import requests
import redis
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql
class CoityArea:
    """Collect district ("city area") links from the Lianjia Beijing rental
    index page and store them in a Redis list.

    The Redis list holds the values returned by the XPath query in order,
    with every value containing ``"zufang"`` turned into an absolute URL.
    Subclasses read the list back and treat it as alternating URL / name
    entries.
    """

    # Host prefix added to the relative hrefs found on the page.
    SITE_ROOT = "https://bj.lianjia.com"
    # Rental index page that carries the district filter.
    INDEX_URL = "https://bj.lianjia.com/zufang/"
    # Redis list that receives the scraped values.
    REDIS_KEY = "city_area_list"
    # Seconds to wait for the server; without a timeout ``requests`` can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Open the Redis connection once and share it between methods.
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        """Run the district scrape; positional/keyword arguments are ignored."""
        self.get_city_area()

    def get_redis(self):
        """Return a Redis client for the local server, database 1."""
        return redis.Redis(host='127.0.0.1', port=6379, db=1)

    def get_city_area(self):
        """Scrape district hrefs and names and append them to Redis."""
        html_xml = self.get_html(self.INDEX_URL)
        # The union selects both the href and the link text of every district
        # entry except the first <li>.
        scraped = html_xml.xpath(
            '//ul[@data-target="area"]//li[position()>1]/a/text() | '
            '//ul[@data-target="area"]//li[position()>1]/a/@href'
        )
        values = [self._to_absolute(item) for item in scraped]
        if not values:
            # RPUSH with no values is rejected by Redis; nothing to store.
            return
        # One variadic RPUSH keeps the order and avoids a round-trip per item.
        self.r.rpush(self.REDIS_KEY, *values)

    def _to_absolute(self, value):
        """Prefix the site root when ``value`` looks like a rental href."""
        if "zufang" in value:
            return self.SITE_ROOT + value
        return value

    def get_html(self, url):
        """Fetch ``url`` with a random User-Agent and return the parsed tree.

        Raises whatever ``requests`` raises on connection failure or when
        ``REQUEST_TIMEOUT`` seconds elapse without a response.
        """
        headers = {
            'User-Agent': UserAgent().random
        }
        response = requests.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        return etree.HTML(response.text)
class BusinessCicle(CoityArea):
    def __call__(self, *args, **kwargs):
        self.get_business_cicle()
    #通过城区的url获取商圈的url
    def get_business_cicle(self):
        #查询城区信息
        city_area_list=self.r.lrange("city_area_list",0,-1)
        # print(city_area_list)
        for index in range(0,len(city_area_list),2):
            #获取城区的url和城区的名称
            city_area_url=city_area_list[index].decode('utf-8')
            city_area_name=city_area_list[index+1].decode('utf-8')
            # print(city_area_url)
            # print(city_area_name)
            #获取城区的html_xml对象
            html_xml=self.get_html(city_area_url)
            #获取商圈信息
            business_cicle_list=html_xml.xpath('//ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/@href | //ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/text()')
            # print(business_cicle_list)
            for index in range(len(business_cicle_list)):
                #将城区和商圈用“-”链接起来
           
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值