import re
import requests
import redis
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql
class CoityArea:
    """Scrape the Beijing district ("city area") rental links from
    bj.lianjia.com and push them into a local Redis list so later pipeline
    stages (e.g. the BusinessCicle subclass) can consume them.

    NOTE(review): class name looks like a typo for "CityArea", but it is
    kept because subclasses inherit from this exact name.
    """

    def __init__(self):
        # Open the Redis connection once and reuse it for every push.
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        """Entry point: calling the instance runs the district scrape."""
        self.get_city_area()

    def get_redis(self):
        """Return a Redis client bound to the local server, database 1."""
        return redis.Redis(host='127.0.0.1', port=6379, db=1)

    def get_city_area(self):
        """Fetch the rental landing page, extract each district's URL and
        name, and rpush both (interleaved, in document order) into the
        Redis list ``city_area_list``.

        Relative hrefs (containing "zufang") are made absolute before
        storage; plain text nodes (district names) are stored as-is.
        """
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)
        # The XPath union yields alternating href/name nodes per district;
        # li[position()>1] skips the leading "all districts" entry.
        city_area_list = html_xml.xpath(
            '//ul[@data-target="area"]//li[position()>1]/a/text() | '
            '//ul[@data-target="area"]//li[position()>1]/a/@href'
        )
        for city_area in city_area_list:
            if "zufang" in city_area:
                # hrefs come back relative ("/zufang/..."); absolutize them.
                city_area = "https://bj.lianjia.com" + city_area
            # Persist district info (URL and name entries) into Redis.
            self.r.rpush("city_area_list", city_area)

    def get_html(self, url):
        """GET *url* with a random User-Agent and return the parsed lxml tree.

        Raises:
            requests.Timeout: if the server does not respond within 10s
                (the original code had no timeout and could hang forever).
            requests.HTTPError: on a non-2xx response, instead of silently
                parsing an error page as scrape data.
        """
        headers = {
            'User-Agent': UserAgent().random
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return etree.HTML(response.text)
class BusinessCicle(CoityArea):
def __call__(self, *args, **kwargs):
self.get_business_cicle()
#通过城区的url获取商圈的url
def get_business_cicle(self):
#查询城区信息
city_area_list=self.r.lrange("city_area_list",0,-1)
# print(city_area_list)
for index in range(0,len(city_area_list),2):
#获取城区的url和城区的名称
city_area_url=city_area_list[index].decode('utf-8')
city_area_name=city_area_list[index+1].decode('utf-8')
# print(city_area_url)
# print(city_area_name)
#获取城区的html_xml对象
html_xml=self.get_html(city_area_url)
#获取商圈信息
business_cicle_list=html_xml.xpath('//ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/@href | //ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/text()')
# print(business_cicle_list)
for index in range(len(business_cicle_list)):
#将城区和商圈用“-”链接起来
# Source note: code taken from the CSDN article "python爬虫---爬取链家新房"
# ("Python crawler — scraping Lianjia new homes"); page footer recorded
# "最新推荐文章于 2024-08-13 21:38:27 发布" (latest recommended article
# published 2024-08-13 21:38:27). The get_business_cicle method above is
# truncated at this point in the paste.