import requests
from lxml import etree
import pymysql
# Scrape rental listings for the Huilongguan area from Anjuke (result pages
# 1-3) and store one row per listing in the MySQL table `anjuke`.

# URL template; `{}` is filled with the 1-based page number. It is kept
# untouched so that every page can be formatted from it.
url = 'https://bj.zu.anjuke.com/fangyuan/huilongguan/p{}/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Connect to the database.
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration or the environment.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', database='python', charset='utf8')
# Create the cursor.
cursor = db.cursor()
# The driver substitutes and escapes the %s placeholders itself, so quotes in
# scraped text can neither break the statement nor inject SQL.
sql = 'insert into anjuke VALUES (null,%s,%s,%s,%s,%s,%s,%s)'
try:
    for p in range(1, 4):
        # Format into a separate name: assigning the result back to `url`
        # would destroy the `{}` placeholder and re-fetch page 1 every time.
        page_url = url.format(p)
        print(p)
        # Send the request; the timeout keeps a stalled server from hanging
        # the script indefinitely.
        response = requests.get(page_url, headers=headers, timeout=10)
        # Parse the page into an element tree.
        html_obj = etree.HTML(response.text)
        html_list = html_obj.xpath('//div[@class="list-content"]/div')
        for i in html_list:
            title_links = i.xpath('./div[1]/h3/a')
            if not title_links:
                # Children without a title link are skipped.
                continue
            # Title
            title = title_links[0].text
            print(title)
            # Room layout: strip all whitespace from the first text node.
            ts = i.xpath('./div[1]/p[1]/text()[1]')[0]
            room = ''.join(ts.split())
            print(room)
            # Size (square metres)
            size = i.xpath('./div[1]/p[1]/text()[2]')[0]
            print(size)
            # Floor
            floor = i.xpath('./div[1]/p[1]/text()[3]')[0]
            print(floor)
            # Address: the second text node of <address> followed by the
            # text of its <a> link, joined with '/'.
            addres = i.xpath('./div[1]/address/a')[0].text
            addres1 = i.xpath('./div[1]/address/text()')[1]
            addres2 = addres1 + '/' + addres
            # The raw text contains runs of whitespace: split() with no
            # argument breaks on any whitespace, and ' '.join() re-joins the
            # pieces with single spaces.
            address = ' '.join(addres2.split())
            # Rental details: all <span> texts joined with '/'.
            info = i.xpath('./div[1]/p[2]/span/text()')
            info = '/'.join(info)
            print(info)
            # Rent price
            price = i.xpath('./div[2]/p[1]/strong/text()')[0]
            print(price)
            cursor.execute(sql, (title, price, room, size, floor, address, info))
        # Commit once per page so that rows from completed pages survive a
        # failure on a later page.
        db.commit()
finally:
    # Always release the cursor and connection, even if scraping fails.
    cursor.close()
    db.close()
# python爬取安居客保存mysql
# 最新推荐文章于 2024-06-02 21:08:37 发布