"""
1导入各种应用模块
import redis #用来进行redis数据库连接
import requests #requests 请求
from lxml import etree #运用xpath分析
from fake_useragent import UserAgent #随机产生请求头
import re #运用正则
import pymysql #用来连接mysql数据库
2获取指定url对应的xml界面(便于进行xpath分析)
方法可采用request或selenium 优先采用requests方法(速度快)
定义的函数属于 3 类
3 定义一个城市类(比如全国各个城市)
在其中定义函数用来获取城市信息,通过给定网站url
获取对应城市名称和url(可能url需要拼接)
将城市信息存入redis中(这样下次可以直接打开redis内容,不用再次请求原url等)
4定义一个城市对应各个区的类并继承城市类
通过城市url获取各区url(包括城市各区的名称)
具体方法同城市的获取
5定义一个专门获取信息等类(可在里面获取最大页,详情页分析)
一开始要连接数据库,不要最后连接
对4中城市各区信息进行循环获取
获取最大页码(可能需要拼接url)
通过5中分析获取指定分页url数据,之后缩小范围进行分析
比如获取图片,名称,描述,价格,城区,面积,房间信息,时间
获取详情页url 并拼接,将上边分析的数据放入一个字典中
单独定义详情页信息函数 (从城市各区跳转的详情界面)
获取里面需要的信息 方法同城市各区获取内容(有的信息可能存在接口中)
有时候需要异常处理
将详情页获取的信息放入以前字典中
插入到mysql数据库(需要将字典中的数据拿出来)(有事需要异常处理)
创建数据库连接对象
#注意:接口最好单独在一个函数中进行分析
"""
# 对应完整代码:
import redis
import requests
from lxml import etree
from fake_useragent import UserAgent
import re
import pymysql
class CityArea:
    """Scrape Beijing district (城区) names and urls from lianjia and cache them in redis.

    The redis list ``city_area_list`` ends up holding alternating entries:
    absolute district url, district name, url, name, ...
    """

    def __init__(self):
        # One redis connection, shared with subclasses via self.r
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        # Calling the instance runs the scrape
        self.get_city_area()

    def get_redis(self):
        """Return the redis connection (db 1) used as the scrape cache."""
        return redis.Redis(host="127.0.0.1", port=6379, db=1)

    def get_city_area(self):
        """Fetch the rental landing page and push district urls/names into redis."""
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)
        # The union xpath yields alternating @href / text() entries for each
        # district after the first ("不限"/all) item
        city_area_list = html_xml.xpath("//ul[@data-target='area']/li[position()>1]/a/@href | "
                                        "//ul[@data-target='area']/li[position()>1]/a/text()")
        print(city_area_list)
        print(len(city_area_list))
        for city_area in city_area_list:
            if "zufang" in city_area:
                # href entries are site-relative; make them absolute
                city_area = "https://bj.lianjia.com" + city_area
            print(city_area)
            # Cache so later stages can run without re-requesting this page
            self.r.rpush("city_area_list", city_area)

    def get_html(self, url):
        """GET *url* with a random User-Agent and return an lxml HTML tree."""
        headers = {"User-Agent": UserAgent().random}
        # Explicit timeout: without it a stalled request hangs the crawler forever
        response = requests.get(url, headers=headers, timeout=10)
        return etree.HTML(response.text)
class BusinessCircle(CityArea):
    """Resolve each cached district into its business circles (商圈) and cache them in redis."""

    def __call__(self, *args, **kwargs):
        self.get_business_circle()

    def get_business_circle(self):
        """For every district in redis, scrape its business-circle links and names."""
        counter = 1
        # city_area_list holds alternating entries: url, name, url, name, ...
        cached = self.r.lrange("city_area_list", 0, -1)
        for pos in range(0, len(cached), 2):
            area_url = cached[pos].decode("utf-8")
            area_name = cached[pos + 1].decode("utf-8")
            print(area_url, area_name)
            tree = self.get_html(area_url)
            # Same alternating @href / text() layout as the district list
            circles = tree.xpath("//div[@id='filter']/ul[4]/li[position()>1]/a/@href | "
                                 "//div[@id='filter']/ul[4]/li[position()>1]/a/text()")
            print(circles)
            for offset, item in enumerate(circles):
                # Odd offsets are display names: store them as "district-circle"
                entry = area_name + "-" + item if offset % 2 == 1 else item
                print(counter, entry, type(entry))
                counter += 1
                self.r.rpush("business_circle_list", entry)
class Lian(CityArea):
    """Crawl every cached business circle's listing pages and persist rows to MySQL."""

    def __call__(self, *args, **kwargs):
        self.count = 1
        # Connect to MySQL up front so every later insert has a live connection
        self.conn_mysql()
        self.count_ucid = 1
        self.get_page_url()

    def get_page_url(self):
        """Walk the cached business circles, compute pagination, fetch each page."""
        # business_circle_list holds alternating entries: url, name, url, name, ...
        business_circle_list = self.r.lrange("business_circle_list", 0, -1)
        for index in range(0, len(business_circle_list), 2):
            business_circle_url = business_circle_list[index].decode("utf-8")
            # hrefs are site-relative; build the absolute url
            business_circle_url = "https://bj.lianjia.com" + business_circle_url
            business_circle_name = business_circle_list[index + 1].decode("utf-8")
            print("==================={}开始下载====================".format(business_circle_name))
            print(business_circle_url, business_circle_name)
            html_xml = self.get_html(business_circle_url)
            # Maximum page number; an empty result means the circle has no listings
            max_page = html_xml.xpath("//div[@class='content__pg']/@data-totalpage")
            if not max_page:
                continue
            max_page = int(max_page[0])
            for page in range(1, max_page + 1):
                page_url = business_circle_url + "pg{}/".format(page)
                self.get_data(page_url)

    def get_data(self, page_url):
        """Extract every listing card on one page into a dict and hand it to parse_detail."""
        html_xml = self.get_html(page_url)
        # Narrow the search to the listing container first
        div_list = html_xml.xpath("//div[@class='content__list']/div")
        for div in div_list:
            # Picture: swap in the higher-resolution variant
            pic = div.xpath(".//img/@data-src")[0]
            pic = pic.replace("250x182", "2500x1800")
            # Title
            title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()")[0].strip()
            # District and business circle
            city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()")[0]
            business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()")[0]
            # Area — may be absent on some cards, so guard the empty list
            area = div.xpath(".//p[@class='content__list--item--des']//text()[4]")
            area = area[0].strip() if area else ""
            # Orientation
            toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]")[0].strip()
            # Room layout text, e.g. "2室1厅1卫"
            fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]")[0].strip()
            # Raw strings so \d is a regex escape, not a deprecated string escape
            room = re.findall(r"(\d+)室", fang_info)
            hall = re.findall(r"(\d+)厅", fang_info)
            toilet = re.findall(r"(\d+)卫", fang_info)
            # Missing components default to 0
            room = int(room[0]) if room else 0
            hall = int(hall[0]) if hall else 0
            toilet = int(toilet[0]) if toilet else 0
            # Publish time
            publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()")[0]
            # Tags, joined into a single "#"-separated string
            sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            sign = "#".join(sign_list)
            # Price
            price = div.xpath(".//em/text()")[0]
            # Detail page url (relative -> absolute)
            detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href")[0]
            detail_url = "https://bj.lianjia.com" + detail_url
            fang_dict = {
                "pic": pic, "title": title, "city_area": city_area, "business_circle": business_circle,
                "area": area, "toward": toward, "room": room, "hall": hall, "toilet": toilet,
                "publish_date": publish_date, "sign": sign, "price": price, "detail_url": detail_url
            }
            self.parse_detail(fang_dict)

    def parse_detail(self, fang_dict):
        """Fetch one listing's detail page: floor info plus the agent phone number."""
        detail_url = fang_dict['detail_url']
        html_xml = self.get_html(detail_url)
        floor = html_xml.xpath("//ul/li[@class='fl oneline'][8]/text()")
        floor = floor[0] if floor else ""
        # The phone number is not in the page HTML; it comes from a JSON endpoint
        # keyed by the agent id (ucid) and the house code embedded in the url.
        ucid = self.get_ucid(html_xml)
        # The dot before "html" must be escaped, otherwise it matches any character
        house_code = re.findall(r"zufang/(.*?)\.html", detail_url)[0]
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?" \
                    "house_codes={}&position=bottom" \
                    "&ucid={}".format(house_code, ucid)
        try:
            headers = {"User-Agent": UserAgent().random}
            # Timeout so one dead endpoint cannot stall the whole crawl
            json_data = requests.get(agent_url, headers=headers, timeout=10).json()
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
        except Exception as e:
            # Endpoint failures happen; fall back to an empty phone number
            print(e)
            phone = ''
        fang_dict["floor"] = floor
        fang_dict["phone"] = phone
        self.insert_mysql(fang_dict)

    def insert_mysql(self, fang_dict):
        """Insert one listing row into the `lianjia` table.

        Uses a parameterized query: the old '"{}"'.format(...) SQL broke on any
        value containing a double quote and was open to SQL injection.
        """
        sql = ("insert into lianjia (pic, title, city_area, business_circle, area, toward, room, "
               "hall, toilet, publish_date, sign, price, detail_url, floor, phone) "
               "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        args = (fang_dict["pic"], fang_dict["title"], fang_dict["city_area"],
                fang_dict["business_circle"], fang_dict["area"], fang_dict["toward"],
                fang_dict["room"], fang_dict["hall"], fang_dict["toilet"],
                fang_dict["publish_date"], fang_dict["sign"], fang_dict["price"],
                fang_dict["detail_url"], fang_dict["floor"], fang_dict["phone"])
        try:
            # Execute and commit; on failure roll back so the connection stays usable
            self.cur.execute(sql, args)
            self.conn.commit()
            print(self.count, sql)
            self.count += 1
        except Exception as e:
            print(e)
            self.conn.rollback()

    def conn_mysql(self):
        """Create the MySQL connection and cursor (note: pymysql charset is "utf8", not "utf-8")."""
        self.conn = pymysql.connect(host="127.0.0.1", user="root", password='123',
                                    database="刘争", charset="utf8")
        self.cur = self.conn.cursor()

    def get_ucid(self, html_xml):
        """Return the agent id from the detail page, retrying up to 3 times, else ""."""
        try:
            ucid = html_xml.xpath("//span[@class='contact__im im__online']/@data-info")[0]
            self.count_ucid = 1  # reset the retry counter on success
            return ucid
        except Exception as e:
            print(e)
            if self.count_ucid == 3:
                return ""
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)
if __name__ == '__main__':
    # Stage 1: cache district info in redis (run once, then leave commented out)
    # cityarea = CityArea()
    # cityarea()
    # Stage 2: instantiate BusinessCircle; calling the instance triggers __call__
    # bc = BusinessCircle()
    # bc()
    # Stage 3: crawl every listing page and write rows to MySQL
    lian = Lian()
    lian()
'''
电话接口分析:
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259333770690183168&position=bottom&ucid=1000000026012783
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2234691835526389760&position=bottom&ucid=1000000023002201
'''