我做了一个在链家网上爬取租房信息的爬虫,主要有三个功能:
一.爬取租房价格,存入.csv文件,因为这个网站url比较简单,所以我直接写进列表
import requests
import urllib.request
from bs4 import BeautifulSoup#用bs4爬取
import bs4
# Use urllib instead of requests to avoid the 403 this site returns
# to the default requests User-Agent.
def text(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    BeautifulSoup
        The page parsed with the stdlib "html.parser" backend.
    """
    # `with` guarantees the connection is closed even if read()/decode()
    # raises -- the original leaked the handle on error.
    with urllib.request.urlopen(url) as fp:
        ht = fp.read().decode("utf8")
    return BeautifulSoup(ht, "html.parser")
# Fetch the Chengdu rental landing page once; district links are scraped from it.
Soup=text("https://cd.lianjia.com/zufang/")
P=[]# rows for whole-rent ("整租") listings
# District <li> entries from the area filter bar on the landing page.
Regina=Soup.find("div",{"id":"filter"}).find("ul",{"data-target":"area"}).find_all("li")
# URL suffixes for the room-count filter; presumably l0..l3 map to the four
# layout columns in S (1-bedroom .. 4+-bedroom) -- TODO confirm against the site.
Type=["l0","l1","l2","l3"]
# CSV header row: district/layout, then the four layout columns.
S=["地区/户型","一居","二居","三居","四居+"]
# Rent-type filter codes; NOTE(review): looks like whole-rent vs shared-rent -- verify.
met=["/rt200600000001","/rt200600000002"]
L=[]# rows for shared-rent ("合租") listings
L.append(S)
P.append(S)
j=0  # loop flag used below to skip the first (aggregate) district entry
example=5 # number of sample listings to scrape per page
# Scrape listing prices and average them.
for item in Regina:
if j==0:
j=1
continue
else:
for m in range(0,len(met)):
uri="https://cd.lianjia.com{}{}".format(item.find("a").attrs.get("href"),met[m])
T=[]
T.append(str(item.find("a").string))
for it in Type:
url=uri+"{}".format(it)
soup=text(url)
Price=soup.find_all("span",{"class":"content__list--item-price"})
i=0
total=0
if len(Price)==0:#没有数据的存入0
T.append("0")
continue
else: