import re

import requests
import redis
import pymysql
from lxml import etree
from fake_useragent import UserAgent
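
# Pipeline overview (run the three stages in order):
#   1. CoityArea     - scrapes the district (城区) names/urls from the rental index page
#                      and pushes them into the Redis list "city_area_list"
#   2. BusinessCicle - reads the districts back and scrapes each district's business
#                      circles (商圈) into the Redis list "business_cicle"
#   3. Lian          - walks every business-circle listing page, parses each flat and
#                      writes it into MySQL
# Redis (db 1) is only used as an intermediate store between the stages; the shared
# HTTP/Redis helpers live on CoityArea and are inherited by the other two classes.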
class CoityArea:
    def __init__(self):
        # initialize the Redis connection
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city_area()

    # connect to the Redis database
    def get_redis(self):
        return redis.Redis(host='127.0.0.1', port=6379, db=1)
    def get_city_area(self):
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)
        # the XPath union returns, for every district <a>, its @href followed by its text,
        # so the result alternates url, name, url, name, ...
        city_area_list = html_xml.xpath('//ul[@data-target="area"]//li[position()>1]/a/text() | //ul[@data-target="area"]//li[position()>1]/a/@href')
        for city_area in city_area_list:
            # only the hrefs contain "zufang"; prepend the domain to make them full urls
            if "zufang" in city_area:
                city_area = "https://bj.lianjia.com" + city_area
            # push the district url/name into Redis
            self.r.rpush("city_area_list", city_area)
    # fetch the page at the given url and return it as an lxml HTML object
    def get_html(self, url):
        headers = {
            'User-Agent': UserAgent().random
        }
        response = requests.get(url, headers=headers)
        html = response.text
        return etree.HTML(html)
class BusinessCicle(CoityArea):
    def __call__(self, *args, **kwargs):
        self.get_business_cicle()

    # get the business-circle urls from every district url
    def get_business_cicle(self):
        # read the district info back from Redis (url, name, url, name, ...)
        city_area_list = self.r.lrange("city_area_list", 0, -1)
        for index in range(0, len(city_area_list), 2):
            # district url and district name
            city_area_url = city_area_list[index].decode('utf-8')
            city_area_name = city_area_list[index + 1].decode('utf-8')
            # parsed HTML of the district page
            html_xml = self.get_html(city_area_url)
            # business-circle info, again alternating @href, text
            business_cicle_list = html_xml.xpath('//ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/@href | //ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/text()')
            for i in range(len(business_cicle_list)):
                business_cicle = business_cicle_list[i]
                # odd positions are names: join district and business circle with "-"
                if i % 2 == 1:
                    business_cicle = city_area_name + "-" + business_cicle_list[i]
                self.r.rpush("business_cicle", business_cicle)
class Lian(CoityArea):
    def __call__(self, *args, **kwargs):
        self.count = 1        # counts inserted rows
        self.conn_mysql()
        self.count_ucid = 1   # retry counter used by get_ucid
        self.get_page_url()
    def get_page_url(self):
        # read the business-circle info from Redis (url, name, url, name, ...)
        business_cicle_list = self.r.lrange("business_cicle", 0, -1)
        for index in range(0, len(business_cicle_list), 2):
            business_cicle_url = business_cicle_list[index].decode("utf-8")
            business_cicle_name = business_cicle_list[index + 1].decode('utf-8')
            # build the full business-circle url
            business_cicle_url = "https://bj.lianjia.com" + business_cicle_url
            html_xml = self.get_html(business_cicle_url)
            # get the maximum page number
            max_page = html_xml.xpath('//div/@data-totalpage')
            # if the max page cannot be found, max_page is an empty list: skip this business circle
            if not max_page:
                continue
            max_page = int(max_page[0])
            # generate the paginated urls
            for page in range(1, max_page + 1):
                print('============ {}: downloading page {} ============='.format(business_cicle_name, page))
                page_url = business_cicle_url + "pg{}".format(page)
                self.get_data(page_url)
    # parse one listing page and extract the data for every flat on it
    def get_data(self, page_url):
        html_xml = self.get_html(page_url)
        # narrow the scope to the individual listing cards
        div_list_all = html_xml.xpath('//div[@class="content__list"]//div[@class="content__list--item"]')
        for div_list in div_list_all:
            # picture (swap the thumbnail size for the full-size image)
            pic = div_list.xpath('.//img/@data-src')[0]
            floor_pic = pic.replace('250x182', '2000x1200')
            # title
            floor_name = div_list.xpath('.//img/@alt')[0]
            # price
            floor_price = div_list.xpath('.//span[@class="content__list--item-price"]/em/text()')[0]
            # tags, joined into a single "/"-separated string
            floor_lable = div_list.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            floor_lable = "/".join(floor_lable)
            # publish time
            floor_time = div_list.xpath(".//p[@class='content__list--item--time oneline']/text()")
            floor_time = floor_time[0] if floor_time else ''
            # location and layout info
            des = div_list.xpath('.//p[@class="content__list--item--des"]//text()')
            floor_city = des[1]            # district
            floor_local = des[3]           # business circle
            floor_area = des[6].strip()    # floor area
            floor_toward = des[8].strip()  # orientation
            floor_room = des[10].strip()   # layout, e.g. "2室1厅1卫"
            # split the layout into rooms / halls / bathrooms
            room = re.findall(r"(\d+)室", floor_room)
            hall = re.findall(r"(\d+)厅", floor_room)
            tolit = re.findall(r"(\d+)卫", floor_room)
            room = room[0] if room else 0
            hall = hall[0] if hall else 0
            tolit = tolit[0] if tolit else 0
            # detail page url
            detail_url = div_list.xpath('.//p[@class="content__list--item--title twoline"]//a[@target="_blank"]/@href')[0]
            detail_url = "https://bj.lianjia.com" + detail_url
            fang_dict = {
                "floor_pic": floor_pic, "floor_name": floor_name, "floor_price": floor_price,
                "floor_lable": floor_lable, "floor_time": floor_time, "floor_city": floor_city,
                "floor_local": floor_local, "floor_area": floor_area, "floor_toward": floor_toward,
                "room": room, "hall": hall, "tolit": tolit, "detail_url": detail_url
            }
            self.parse_detail_info(fang_dict)
    # fetch the detail page, extract the floor info and the agent's phone number
    def parse_detail_info(self, fang_dict):
        detail_url = fang_dict['detail_url']
        print(detail_url)
        # parsed HTML of the detail page
        html_xml = self.get_html(detail_url)
        floor = html_xml.xpath('//ul/li[@class="fl oneline"][8]/text()')
        floor = floor[0] if floor else ''
        # the agent's phone number is not in the page itself; it is returned by an AJAX
        # endpoint that needs the agent's ucid and the listing's house_code
        ucid_id = self.get_ucid(html_xml)
        # house_code is the identifier embedded in the detail url
        house_code = re.findall(r'zufang/(.*?)\.html', detail_url)[0]
        # build the brokers API url
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(house_code, ucid_id)
        # call the API and read the phone number out of the JSON response
        try:
            headers = {'User-Agent': UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
        except Exception as e:
            print(e)
        else:
            # add floor and phone to the dict, then insert the record into MySQL
            fang_dict['floor'] = floor
            fang_dict['phone'] = phone
            self.insert_mysql(fang_dict)
    # write one record into the MySQL table `lianjia`
    def insert_mysql(self, fang_dict):
        # unpack the dict into the SQL statement; %s placeholders let pymysql handle quoting
        sql = """
            insert into lianjia (floor_pic, floor_name, floor_price, floor_lable, floor_time, floor_city,
            floor_local, floor_area, floor_toward, room, hall, tolit, detail_url, floor, phone) values
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            fang_dict['floor_pic'], fang_dict['floor_name'], fang_dict['floor_price'],
            fang_dict['floor_lable'], fang_dict['floor_time'], fang_dict['floor_city'],
            fang_dict['floor_local'], fang_dict['floor_area'], fang_dict['floor_toward'],
            fang_dict['room'], fang_dict['hall'], fang_dict['tolit'],
            fang_dict['detail_url'], fang_dict['floor'], fang_dict['phone']
        )
        try:
            self.cur.execute(sql, values)
            self.conn.commit()
            print(self.count, sql)
            self.count += 1
        except Exception as e:
            print(e)
            self.conn.rollback()
    # create the MySQL connection and cursor
    def conn_mysql(self):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin', database='02180530', charset='utf8')
        self.cur = self.conn.cursor()
    # extract the agent's ucid from the detail page; give up and return '' after 3 failed attempts
    def get_ucid(self, html_xml):
        try:
            ucid_id = html_xml.xpath('//div[@class="phone__hover--wrapper"]/span[@class="contact__im im__online"]/@data-info')[0]
            self.count_ucid = 1
            return ucid_id
        except Exception as e:
            print(e)
            if self.count_ucid == 3:
                return ''
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)
if __name__ == '__main__':
    # the three stages must run in order; the first two only need to run once
    # to fill the Redis lists that Lian reads from.
    # city = CoityArea()
    # city()
    # instantiate BusinessCicle; calling the instance triggers __call__
    # bc = BusinessCicle()
    # bc()
    lian = Lian()
    lian()
"""
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2262932561259143168&position=bottom&ucid=1000000023007453
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259430864331218944&position=bottom&ucid=1000000020276829
"""