爬取链家网数据(精简版)
直接上代码
一、爬取太原链家网1页数据:
#三个功能函数,一个主函数
import requests
from bs4 import BeautifulSoup
#1.获取网页源代码的函数
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The response encoding is set from ``apparent_encoding`` before the text
    is decoded.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they are treated as failures.
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are swallowed; KeyboardInterrupt and
        # programming errors are no longer hidden by a bare ``except``.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
#2.解析源代码,提取信息
def fillUnivList(ulist, html):
    """Parse a rental-list page and append one row per listing to ``ulist``.

    Each appended row is ``[name, area, orientation, price]``:

    * name        -- string of the 4th ``<a>`` in the listing, with newlines
                     and spaces removed
    * area        -- field 1 of the 2nd ``<p>`` after splitting on ``"/"``
    * orientation -- field 2 of the same ``<p>``
    * price       -- string of the listing's first ``<em>``

    Listings that do not have this markup are skipped instead of aborting
    the whole parse.

    Args:
        ulist: list that receives the parsed rows (mutated in place).
        html: page source to parse.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all(attrs={'class': 'content__list--item--main'}):
        try:
            name = item.find_all('a')[3].string.replace("\n", "").replace(" ", "")
            # Clean and split the description paragraph once instead of
            # searching the listing twice.
            fields = item.find_all('p')[1].text.replace("\n", "").replace(" ", "").split("/")
            row = [name, fields[1], fields[2], item.em.string]
        except (IndexError, AttributeError):
            # Missing tag, too few "/" fields, or a tag whose ``.string`` is
            # None: this listing does not match the expected layout.
            continue
        ulist.append(row)
#3.输出租房信息
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: list of rows; each row provides at least four string fields
            (name, area, orientation, price).
        num: maximum number of rows to print. If ``ulist`` holds fewer rows,
            all of them are printed; a non-positive value prints only the
            header.
    """
    row_format = '{:70}\t{:11}\t{:11}\t{:11}'
    print(row_format.format("小区名称", "租房面积", "房屋朝向", "租房价格"))
    # Slicing (rather than indexing range(num)) cannot run past the end of
    # the list; max() keeps a negative num from slicing from the tail.
    for u in ulist[:max(num, 0)]:
        print(row_format.format(u[0], u[1], u[2], u[3]))
#主函数
def main(url='https://ty.lianjia.com/zufang/', limit=30):
    """Fetch one rental-list page, parse it, and print the listings.

    Args:
        url: page to download; defaults to the first Taiyuan rental page.
        limit: maximum number of listings to print.
    """
    uinfo = []
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    # Never request more rows than were parsed: a failed download or a short
    # page would otherwise make the printer index past the end of the list.
    printUnivList(uinfo, min(limit, len(uinfo)))


if __name__ == "__main__":
    main()
二、爬取太原链家网100页数据:
#三个功能函数,一个主函数
import requests
from bs4 import BeautifulSoup
#1.获取网页源代码的函数
def getHTMLText(url):
    """Fetch ``url`` and return its body decoded as text.

    The encoding is taken from the response's ``apparent_encoding`` before
    decoding.

    Returns:
        The page text, or ``""`` if the request fails (network error,
        timeout, invalid URL, or an HTTP error status).
    """
    try:
        response = requests.get(url, timeout=30)
        # 4xx/5xx statuses raise here and are handled as a failed download.
        response.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare ``except`` so that Ctrl-C and genuine bugs
        # are not silently converted into an empty page.
        return ""
    response.encoding = response.apparent_encoding
    return response.text
#2.解析源代码,提取信息
def fillUnivList(ulist, html):
    """Parse a rental-list page and append one row per listing to ``ulist``.

    Each row is ``[name, area, orientation, price]``, taken from the 4th
    ``<a>``, fields 1 and 2 of the 2nd ``<p>`` (split on ``"/"``), and the
    first ``<em>`` of every ``content__list--item--main`` element.

    A listing that lacks this markup is skipped on its own; it no longer
    causes every remaining listing on the page to be dropped.

    Args:
        ulist: list that receives the parsed rows (mutated in place).
        html: page source to parse.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all(attrs={'class': 'content__list--item--main'}):
        try:
            # Strip newlines/spaces from the name so it fits the fixed-width
            # output columns, the same way the description fields are cleaned.
            name = item.find_all('a')[3].string.replace("\n", "").replace(" ", "")
            fields = item.find_all('p')[1].text.replace("\n", "").replace(" ", "").split("/")
            row = [name, fields[1], fields[2], item.em.string]
        except (IndexError, AttributeError):
            # Missing tag, too few "/" fields, or ``.string`` is None.
            continue
        ulist.append(row)
#3.输出租房信息
def printUnivList(ulist, num):
    """Print at most ``num`` rows of ``ulist`` as tab-separated columns.

    No header is printed here. A row that cannot be formatted (fewer than
    four fields, or a non-string field such as ``None``) is skipped and the
    remaining rows are still printed.

    Args:
        ulist: list of rows (name, area, orientation, price).
        num: maximum number of rows to print.
    """
    row_format = '{:55}\t{:15}\t{:15}\t{:15}'
    # Slice instead of indexing range(num) so a short list is not an error;
    # max() keeps a negative num from slicing from the tail.
    for u in ulist[:max(num, 0)]:
        try:
            line = row_format.format(u[0], u[1], u[2], u[3])
        except (LookupError, TypeError):
            # Only the malformed row is dropped; previously a bare except
            # silently stopped all further output.
            continue
        print(line)
#主函数
def main(first_page=1, last_page=100):
    """Scrape a range of rental-list pages and print every listing found.

    Prints the column header once, then downloads, parses and prints each
    page in turn.

    Args:
        first_page: number of the first page to fetch (inclusive).
        last_page: number of the last page to fetch (inclusive).
    """
    print('{:55}\t{:15}\t{:15}\t{:15}'.format("小区名称", "租房面积", "房屋朝向", "租房价格"))
    for page in range(first_page, last_page + 1):
        url = 'https://ty.lianjia.com/zufang/pg{}/#contentList'.format(page)
        html = getHTMLText(url)
        if not html:
            # getHTMLText returns "" on failure; nothing to parse for this page.
            continue
        uinfo = []
        fillUnivList(uinfo, html)
        printUnivList(uinfo, len(uinfo))


if __name__ == "__main__":
    main()