import re

import requests
import redis
import pymysql
from lxml import etree
from fake_useragent import UserAgent
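
# Pipeline overview (run the three stages in order):
#   1. CoityArea     - scrapes the district (城区) names/urls from the rental index page
#                      and pushes them into the Redis list "city_area_list"
#   2. BusinessCicle - reads the districts back and scrapes each district's business
#                      circles (商圈) into the Redis list "business_cicle"
#   3. Lian          - walks every business-circle listing page, parses each flat and
#                      writes it into MySQL
# Redis (db 1) is only used as an intermediate store between the stages; the shared
# HTTP/Redis helpers live on CoityArea and are inherited by the other two classes.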
class CoityArea:
    def __init__(self):
        # initialize the Redis connection
        self.r = self.get_redis()

    def __call__(self, *args, **kwargs):
        self.get_city_area()

    # connect to the Redis database
    def get_redis(self):
        return redis.Redis(host='127.0.0.1', port=6379, db=1)
    def get_city_area(self):
        base_url = "https://bj.lianjia.com/zufang/"
        html_xml = self.get_html(base_url)
        # the XPath union returns, for every district <a>, its @href followed by its text,
        # so the result alternates url, name, url, name, ...
        city_area_list = html_xml.xpath('//ul[@data-target="area"]//li[position()>1]/a/text() | //ul[@data-target="area"]//li[position()>1]/a/@href')
        for city_area in city_area_list:
            # only the hrefs contain "zufang"; prepend the domain to make them full urls
            if "zufang" in city_area:
                city_area = "https://bj.lianjia.com" + city_area
            # push the district url/name into Redis
            self.r.rpush("city_area_list", city_area)
    # fetch the page at the given url and return it as an lxml HTML object
    def get_html(self, url):
        headers = {
            'User-Agent': UserAgent().random
        }
        response = requests.get(url, headers=headers)
        html = response.text
        return etree.HTML(html)
class BusinessCicle(CoityArea):
    def __call__(self, *args, **kwargs):
        self.get_business_cicle()

    # get the business-circle urls from every district url
    def get_business_cicle(self):
        # read the district info back from Redis (url, name, url, name, ...)
        city_area_list = self.r.lrange("city_area_list", 0, -1)
        for index in range(0, len(city_area_list), 2):
            # district url and district name
            city_area_url = city_area_list[index].decode('utf-8')
            city_area_name = city_area_list[index + 1].decode('utf-8')
            # parsed HTML of the district page
            html_xml = self.get_html(city_area_url)
            # business-circle info, again alternating @href, text
            business_cicle_list = html_xml.xpath('//ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/@href | //ul[@data-target="area"]/li[@data-type="bizcircle"][position()>1]/a/text()')
            for i in range(len(business_cicle_list)):
                business_cicle = business_cicle_list[i]
                # odd positions are names: join district and business circle with "-"
                if i % 2 == 1:
                    business_cicle = city_area_name + "-" + business_cicle_list[i]
                self.r.rpush("business_cicle", business_cicle)
class Lian(CoityArea):
    def __call__(self, *args, **kwargs):
        self.count = 1        # counts inserted rows
        self.conn_mysql()
        self.count_ucid = 1   # retry counter used by get_ucid
        self.get_page_url()
    def get_page_url(self):
        # read the business-circle info from Redis (url, name, url, name, ...)
        business_cicle_list = self.r.lrange("business_cicle", 0, -1)
        for index in range(0, len(business_cicle_list), 2):
            business_cicle_url = business_cicle_list[index].decode("utf-8")
            business_cicle_name = business_cicle_list[index + 1].decode('utf-8')
            # build the full business-circle url
            business_cicle_url = "https://bj.lianjia.com" + business_cicle_url
            html_xml = self.get_html(business_cicle_url)
            # get the maximum page number
            max_page = html_xml.xpath('//div/@data-totalpage')
            # if the max page cannot be found, max_page is an empty list: skip this business circle
            if not max_page:
                continue
            max_page = int(max_page[0])
            # generate the paginated urls
            for page in range(1, max_page + 1):
                print('============ {}: downloading page {} ============='.format(business_cicle_name, page))
                page_url = business_cicle_url + "pg{}".format(page)
                self.get_data(page_url)
    # parse one listing page and extract the data for every flat on it
    def get_data(self, page_url):
        html_xml = self.get_html(page_url)
        # narrow the scope to the individual listing cards
        div_list_all = html_xml.xpath('//div[@class="content__list"]//div[@class="content__list--item"]')
        for div_list in div_list_all:
            # picture (swap the thumbnail size for the full-size image)
            pic = div_list.xpath('.//img/@data-src')[0]
            floor_pic = pic.replace('250x182', '2000x1200')
            # title
            floor_name = div_list.xpath('.//img/@alt')[0]
            # price
            floor_price = div_list.xpath('.//span[@class="content__list--item-price"]/em/text()')[0]
            # tags, joined into a single "/"-separated string
            floor_lable = div_list.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")
            floor_lable = "/".join(floor_lable)
            # publish time
            floor_time = div_list.xpath(".//p[@class='content__list--item--time oneline']/text()")
            floor_time = floor_time[0] if floor_time else ''
            # location and layout info
            des = div_list.xpath('.//p[@class="content__list--item--des"]//text()')
            floor_city = des[1]            # district
            floor_local = des[3]           # business circle
            floor_area = des[6].strip()    # floor area
            floor_toward = des[8].strip()  # orientation
            floor_room = des[10].strip()   # layout, e.g. "2室1厅1卫"
            # split the layout into rooms / halls / bathrooms
            room = re.findall(r"(\d+)室", floor_room)
            hall = re.findall(r"(\d+)厅", floor_room)
            tolit = re.findall(r"(\d+)卫", floor_room)
            room = room[0] if room else 0
            hall = hall[0] if hall else 0
            tolit = tolit[0] if tolit else 0
            # detail page url
            detail_url = div_list.xpath('.//p[@class="content__list--item--title twoline"]//a[@target="_blank"]/@href')[0]
            detail_url = "https://bj.lianjia.com" + detail_url
            fang_dict = {
                "floor_pic": floor_pic, "floor_name": floor_name, "floor_price": floor_price,
                "floor_lable": floor_lable, "floor_time": floor_time, "floor_city": floor_city,
                "floor_local": floor_local, "floor_area": floor_area, "floor_toward": floor_toward,
                "room": room, "hall": hall, "tolit": tolit, "detail_url": detail_url
            }
            self.parse_detail_info(fang_dict)
    # fetch the detail page, extract the floor info and the agent's phone number
    def parse_detail_info(self, fang_dict):
        detail_url = fang_dict['detail_url']
        print(detail_url)
        # parsed HTML of the detail page
        html_xml = self.get_html(detail_url)
        floor = html_xml.xpath('//ul/li[@class="fl oneline"][8]/text()')
        floor = floor[0] if floor else ''
        # the agent's phone number is not in the page itself; it is returned by an AJAX
        # endpoint that needs the agent's ucid and the listing's house_code
        ucid_id = self.get_ucid(html_xml)
        # house_code is the identifier embedded in the detail url
        house_code = re.findall(r'zufang/(.*?)\.html', detail_url)[0]
        # build the brokers API url
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(house_code, ucid_id)
        # call the API and read the phone number out of the JSON response
        try:
            headers = {'User-Agent': UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
        except Exception as e:
            print(e)
        else:
            # add floor and phone to the dict, then insert the record into MySQL
            fang_dict['floor'] = floor
            fang_dict['phone'] = phone
            self.insert_mysql(fang_dict)
    # write one record into the MySQL table `lianjia`
    def insert_mysql(self, fang_dict):
        # unpack the dict into the SQL statement; %s placeholders let pymysql handle quoting
        sql = """
            insert into lianjia (floor_pic, floor_name, floor_price, floor_lable, floor_time, floor_city,
            floor_local, floor_area, floor_toward, room, hall, tolit, detail_url, floor, phone) values
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            fang_dict['floor_pic'], fang_dict['floor_name'], fang_dict['floor_price'],
            fang_dict['floor_lable'], fang_dict['floor_time'], fang_dict['floor_city'],
            fang_dict['floor_local'], fang_dict['floor_area'], fang_dict['floor_toward'],
            fang_dict['room'], fang_dict['hall'], fang_dict['tolit'],
            fang_dict['detail_url'], fang_dict['floor'], fang_dict['phone']
        )
        try:
            self.cur.execute(sql, values)
            self.conn.commit()
            print(self.count, sql)
            self.count += 1
        except Exception as e:
            print(e)
            self.conn.rollback()
    # create the MySQL connection and cursor
    def conn_mysql(self):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='admin', database='02180530', charset='utf8')
        self.cur = self.conn.cursor()
    # extract the agent's ucid from the detail page; give up and return '' after 3 failed attempts
    def get_ucid(self, html_xml):
        try:
            ucid_id = html_xml.xpath('//div[@class="phone__hover--wrapper"]/span[@class="contact__im im__online"]/@data-info')[0]
            self.count_ucid = 1
            return ucid_id
        except Exception as e:
            print(e)
            if self.count_ucid == 3:
                return ''
            else:
                self.count_ucid += 1
                return self.get_ucid(html_xml)
if __name__ == '__main__':
    # the three stages must run in order; the first two only need to run once
    # to fill the Redis lists that Lian reads from.
    # city = CoityArea()
    # city()
    # instantiate BusinessCicle; calling the instance triggers __call__
    # bc = BusinessCicle()
    # bc()
    lian = Lian()
    lian()
"""
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2262932561259143168&position=bottom&ucid=1000000023007453
https://bj.lianjia.com/zufang/aj/house/brokers?house_codes=BJ2259430864331218944&position=bottom&ucid=1000000020276829
"""