# 爬取链家网站北京租房信息 (scrape Beijing rental listings from the Lianjia website)
import requests
import re
import pymysql
# Connection settings are passed by keyword: PyMySQL 1.0 made the arguments of
# connect() keyword-only, so the positional form no longer works there.
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file instead.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='126315',
    database='petzhang',
    # The script stores Chinese text, so ask for a full-Unicode connection
    # rather than relying on the driver/server default.
    charset='utf8mb4',
)
cursor = db.cursor()

# Browser-like User-Agent header passed to requests.get().
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36',
}
# Labels of the `<li class="fl oneline">LABEL:value</li>` items read from a
# detail page, listed in the order their columns appear in _INSERT_SQL.
_ONELINE_LABELS = ('面积', '朝向', '入住', '楼层', '电梯',
                   '用水', '用电', '燃气', '采暖', '租期')

# Parameterized insert: the driver escapes every value, so quotes inside the
# scraped text can neither break the statement nor inject SQL.
_INSERT_SQL = (
    'insert into `chaoyang` (`房源标题`,`网址`,`市区`,`商圈`,`小区`,`租赁方式`,'
    '`价格`,`房屋类型`,`面积`,`朝向`,`入住`,`楼层`,`电梯`,`用水`,`用电`,`燃气`,'
    '`采暖`,`租期`,`代理人`,`联系方式`) values ('
    + ','.join(['%s'] * 20) + ')'
)

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
_TIMEOUT = 10


def _first(pattern, text):
    """Return the first capture group of *pattern* in *text*, or '' if absent."""
    match = re.search(pattern, text)
    return match.group(1) if match else ''


def _item(values, index):
    """Return values[index], or '' when *index* is past the end of *values*."""
    return values[index] if index < len(values) else ''


def _fetch(url):
    """Return the body of *url* as text, or None (after printing) on failure."""
    try:
        return requests.get(url, headers=headers, timeout=_TIMEOUT).text
    except requests.RequestException as e:
        print(e)
        return None


def _parse_detail(page):
    """Extract the fields of one detail page; every value is a plain string.

    Each field is the first regex match (or '' when the page lacks it). This
    includes 朝向, which is stored as text rather than as the repr of a list.
    """
    fields = {
        'title': _first(r'<p class="content__title">(.*?)</p>', page),
        'price': _first(r'<span>(.*?)</span>元/月', page),
        'method': _first(r'<li><span class="label">租赁方式:</span>(.*?)</li>', page),
        'leixing': _first(r'<li><span class="label">房屋类型:</span>(.*?)</li>', page),
        'agent': _first(r'name":"(.*?)","office', page),
        'phone': _first(r'phone400":"(.*?)","phone', page),
    }
    fields['oneline'] = [
        _first(r'<li class="fl oneline">{}:(.*?)</li>'.format(label), page)
        for label in _ONELINE_LABELS
    ]
    return fields


def getdata(n):
    """Scrape result page *n* of the Chaoyang rentals and store its listings.

    Downloads the listing page, follows every detail link found on it, and
    inserts one row per listing into the `chaoyang` table through the
    module-level `cursor`/`db`. Progress and errors are printed.

    Args:
        n: Page number substituted into the listing URL.

    Returns:
        None. A failed request or a failed insert is printed and skipped so
        that the remaining listings of the page are still processed.
    """
    first_url = 'https://bj.lianjia.com/zufang/chaoyang/pg{}'.format(n)
    listing = _fetch(first_url)
    if listing is None:
        return

    # Location columns are taken from the listing page. Only the Chinese
    # characters of each matched chunk are kept for the district.
    districts = [
        ''.join(re.findall(r'[\u4e00-\u9fa5]+', chunk))
        for chunk in re.findall(
            r'<a target="_blank" href="/zufang/(.*?)/a>-<a href="/zufang/',
            listing)
    ]
    areas = re.findall(r'target="_blank">(.*?)</a>-<a title=', listing)
    communities = re.findall(r'</a>-<a title="(.*?)" href=', listing)
    detail_ids = re.findall(r'<a target="_blank" href="/zufang/BJ(.*?)">', listing)

    # NOTE(review): locations are paired with detail links purely by position,
    # which assumes the four regexes match once per listing, in the same
    # order. Confirm against the current page markup.
    rows = []
    for i, detail_id in enumerate(detail_ids):
        detailurl = 'https://bj.lianjia.com/zufang/BJ{}'.format(detail_id)
        page = _fetch(detailurl)
        if page is None:
            continue
        fields = _parse_detail(page)
        if not fields['title']:
            # The page does not have the expected layout; nothing to store.
            print('skipped (no title found): ' + detailurl)
            continue
        rows.append(
            [fields['title'], detailurl,
             _item(districts, i), _item(areas, i), _item(communities, i),
             fields['method'], fields['price'], fields['leixing']]
            + fields['oneline']
            + [fields['agent'], fields['phone']]
        )

    print('page'+'-'+str(n))

    for row in rows:
        # One try per row: a bad row must not discard the rest of the page.
        try:
            cursor.execute(_INSERT_SQL, row)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
        else:
            print("已存储" + row[0])
# Disabled code: this bare string literal is evaluated and discarded, so the
# `savedata` draft inside it never runs. The draft formats one `chaoyang`
# insert per entry of `title1`, executes it and commits.
'''
def savedata():
try:
for j in range(len(title1)):
# print(title1[j][0])
sql = 'insert into `chaoyang` (`房源标题`,`网址`,`市区`,`商圈`,`小区`,`租赁方式`,`价格`,`房屋类型`,`面积`,`朝向`,`入住`,`楼层`,`电梯`,`用水`,`用电`,`燃气`,`采暖`,`租期`,`代理人`,`联系方式`) values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")' \
.format(title1[j][0], url1[j], loca1[j], loca2[j], loca3[j], method1[j][0],
price1[j][0], leixing1[j][0], square1[j][0], chaoxiang1[j], ruzhushijian1[j][0],
louceng1[j][0], dianti1[j][0], yongshui1[j][0], yongdian1[j][0], ranqi1[j][0],
cainuan1[j][0], zuqi1[j][0], agent1[j][0], phone1[j][0])
cursor.execute(sql)
db.commit()
print("已存储" + title1[j][0])
except Exception as e:
print(e)
'''
if __name__ == '__main__':
    # Crawl result pages 1-100, then release the cursor and the connection
    # even when a page raises.
    try:
        for n in range(1, 101):
            getdata(n)
    finally:
        cursor.close()
        db.close()
# ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200517125156677.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2Vsb25nZXJ6aGM=,size_16,color_FFFFFF,t_70)