# Scrape second-hand housing listings from Lianjia (bj.lianjia.com)
import requests  # lxml must also be installed for BeautifulSoup's 'lxml' parser
from bs4 import BeautifulSoup
header = {
    "Host": 'bj.lianjia.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
}
def get_url(url):
    # Fetch a page and return its HTML text, or None on a non-200 status.
    res = requests.get(url, headers=header)
    if res.status_code == 200:
        res.encoding = 'utf-8'  # specify the encoding explicitly
        return res.text
    else:
        # Report the failing URL and status code (e.g. 403 when blocked).
        print(url, res.status_code)
        return None
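# --- Not part of the original script: a minimal retry sketch. Lianjia may
# throttle aggressive crawlers, so on failure we retry a few times with a
# pause. The attempt count and delay are arbitrary assumptions.
import time

def get_url_with_retry(url, attempts=3, delay=2):
    # Hypothetical helper wrapping get_url() above.
    for _ in range(attempts):
        html = get_url(url)
        if html is not None:
            return html
        time.sleep(delay)  # back off before the next attempt
    return None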
def get_all_url(html):
    # Collect the detail-page link of every listing on the index page.
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('div.info.clear .title a')  # one <a> per listing title
    all_url = []
    for i in title:
        all_url.append(i.get('href'))
    return all_url
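# --- Sketch, not in the original: crawl several index pages at once. This
# assumes Lianjia paginates its list as /ershoufang/pg2/, /ershoufang/pg3/,
# and so on; verify the URL scheme before relying on it.
def get_all_pages(base_url, pages=3):
    urls = []
    for n in range(1, pages + 1):
        page_html = get_url(f'{base_url}pg{n}/')
        if page_html:
            urls.extend(get_all_url(page_html))
    return urls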
def parser_info(info_html):
    # Parse one listing's detail page into a single comma-separated record.
    soup = BeautifulSoup(info_html, 'lxml')
    title = soup.select('.title .main')[0].text        # listing title
    total = soup.select('span.total')                  # total price (万)
    pv = soup.select('.unitPriceValue')                # price per square metre
    name = soup.select('a.info')                       # residential community name
    base = soup.select('.base .content')               # basic attributes block
    l_trans = soup.select('.transaction .content li')  # eight transaction attributes
    data = ', '.join([
        title,
        total[0].text + "万",
        pv[0].text,
        name[0].text,
        base[0].text.strip() + '\n',
        l_trans[0].text.strip() + '\n',
        l_trans[1].text.strip() + '\n',
        l_trans[2].text.strip() + '\n',
        l_trans[3].text.strip() + '\n',
        l_trans[4].text.strip() + '\n',
        l_trans[5].text.strip() + '\n',
        # The mortgage <li> holds a label and a value separated by
        # whitespace (e.g. '抵押信息 无抵押'), so split() yields both parts.
        l_trans[6].text.split()[0] + '\n',
        l_trans[6].text.split()[1] + '\n',
        l_trans[7].text.strip() + '\n\n',
    ])
    return data
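# --- Sketch, not in the original: soup.select() returns an empty list when
# a selector misses (layout change, CAPTCHA page), so the bare [0] indexing
# above raises IndexError. A hypothetical guarded accessor:
def first_text(soup, selector, default=''):
    hits = soup.select(selector)
    return hits[0].text.strip() if hits else default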
def save_f(data):
    # Append one record to the output file. GBK matches the original
    # script; switch to UTF-8 if any character falls outside GBK.
    with open('Lian_info.txt', 'a', encoding='GBK') as f:
        f.write(data)
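# --- Sketch, not in the original: write structured rows with the csv
# module instead of one flat string (hypothetical file name Lian_info.csv).
# 'utf-8-sig' adds a BOM so Excel displays the Chinese fields correctly.
import csv

def save_csv(row, path='Lian_info.csv'):
    # `row` should be a list of field strings rather than the joined string.
    with open(path, 'a', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerow(row)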
def main(url):
    # Fetch the index page, collect every listing URL, then scrape and
    # save each listing in turn.
    html = get_url(url)
    if html is None:
        return
    for detail_url in get_all_url(html):
        info_html = get_url(detail_url)
        if info_html is None:
            continue
        data = parser_info(info_html)
        save_f(data)

if __name__ == '__main__':
    url = 'https://bj.lianjia.com/ershoufang/'
    main(url)