# Step 1: data scraping
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
import time,random
from selenium.webdriver import ChromeOptions
def main():
    """Entry point: crawl the Anjuke Xi'an new-home listing pages and dump the rows to an .xls file."""
    base_url = "https://xa.fang.anjuke.com/loupan/all/p"
    rows = getData(base_url)
    # Output workbook name (kept in Chinese — it is a runtime value, not a comment).
    out_path = "安居客爬虫数据.xls"
    saveData(rows, out_path)
def getData(baseurl):
datalist = []
headers = {
"cookie": "isp=true; isp=true; aQQ_ajkguid=C762D025-2585-194F-7317-8EC539296440; _ga=GA1.2.18092521.1626051248; _gid=GA1.2.2133028006.1626051248; id58=e87rkGDrkq+BJ/A5/JzXAg==; 58tj_uuid=4604ab87-5912-4903-a6dc-28ae7ae20bc1; als=0; isp=true; wmda_uuid=dfa952c1ee878d222eeb947c5618cfd7; wmda_new_uuid=1; wmda_visited_projects=%3B8788302075828; cmctid=483; xxzl_cid=629248c0af8e4b1e8a3219e3d1e090d7; xzuid=144b3c94-6fb0-45e0-b7f8-6e43c085f8a4; ctid=31; sessid=A07D28BD-B371-B893-463C-SX0712140406; obtain_by=2; twe=2; wmda_session_id_8788302075828=1626069849725-02cb50ff-be87-431f; init_refer=; new_uv=3; lp_lt_ut=1023adc4fcf5533ab348b520b9a4ce05; ved_loupans=472113; new_session=0",
"referer": "https://xa.fang.anjuke.com/loupan/all/p13/",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
for i in range(0,26):
url = baseurl+ str(i)
driver = webdriver.Chrome()
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
trs = soup.select("div.key-list>.item-mod")
print(len(trs))
for div in trs:
data=[]
name = div.select(".infos>.lp-name>.items-name")[0].get_text()
print("名字:" + name)
data.append(name)
tstate = div.select(".tags-wrap>.tag-panel>i:nth-of-type(1)")
if tstate:
state = tstate[0].get_text()
print(state)
data.append(state)
else:
data.append(" ")
position = div.select(".address>.list-map")[0].get_text()
print("位置:" + position)
data.append(position)
thuxing = div.select("div.infos > a.huxing>span:nth-of-type(1)")
if thuxing:
huxing = thuxing[0].get_text()
print(huxing)
data.append(huxing)
else:
data.append(" ")
thuxing2 = div.select("div.infos > a.huxing>span:nth-of-type(2)")
if thuxing:
huxing2 = thuxing[0].get_text()
print(huxing2)
data.append(huxing2)
else:
data.append(" ")
tarea = div.select("div.infos > a.huxing > span.building-area")
if tarea:
area = tarea[0].get_text()
print(area)
data.append(area)
else:
data.append(" ")
ttpe = div.select(".tags-wrap>.tag-panel>.wuyetp")
if ttpe:
type = ttpe[0].get_text()
print("类型:" + type)
data.append(type)
else:
data.append(" ")
tshuxing1 = div.select(".tags-wrap>.tag-panel>span:nth-of-type(1)")
if tshuxing1:
shuxing1 = tshuxing1[