用 Selenium 爬取太原链家网的租房信息
话不多说,下面直接给出完整代码。
#引入selenium、 pandas、openpyxl库
import openpyxl
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Scraper configuration: the script walks result pages pg1..pg100 of the
# URL below and writes every parsed listing to a single Excel sheet.
BASE_URL = 'https://ty.lianjia.com/zufang/pg'
FIRST_PAGE = 1
LAST_PAGE = 100
OUTPUT_PATH = 's-lianjia.xlsx'
SHEET_NAME = '爬虫数据'
# Output column order: district, business area, community, floor area,
# orientation, layout, monthly rent.
COLUMNS = ['区', '商圈', '小区', '面积', '朝向', '户型', '月租']


def parse_description(text):
    """Split a listing description into (area, orientation, layout).

    Newlines and spaces are stripped from *text*, the result is split on
    "/", and fields 1, 2 and 3 are returned (field 0 is skipped because
    the location links are read separately from the ``<a>`` tags).

    Raises:
        IndexError: if the description has fewer than four "/" fields.
    """
    fields = text.replace("\n", "").replace(" ", "").split("/")
    return fields[1], fields[2], fields[3]


def parse_listing(item):
    """Extract one row of data from a listing element.

    Args:
        item: a Selenium element with class ``content__list--item--main``.

    Returns:
        A 7-tuple ordered like ``COLUMNS``.

    Raises:
        IndexError: if an expected sub-element or description field is
            missing.
    """
    # Look the description element up once instead of once per field.
    des = item.find_elements(By.CLASS_NAME, 'content__list--item--des')[0]
    links = des.find_elements(By.TAG_NAME, 'a')
    district = links[0].text
    business_area = links[1].text
    community = links[2].text
    area, orientation, layout = parse_description(des.text)
    rent = item.find_elements(
        By.CLASS_NAME, 'content__list--item-price')[0].text
    return (district, business_area, community,
            area, orientation, layout, rent)


def scrape_listings(first_page=FIRST_PAGE, last_page=LAST_PAGE):
    """Visit each result page and return the parsed listings as rows.

    A single Chrome session is reused for every page and is always closed,
    even when a page load fails. A listing that cannot be parsed is
    reported and skipped as a whole, so every returned row is complete.

    Args:
        first_page: first page number to fetch (inclusive).
        last_page: last page number to fetch (inclusive).

    Returns:
        A list of 7-tuples ordered like ``COLUMNS``.
    """
    rows = []
    browser = webdriver.Chrome()
    try:
        for page in range(first_page, last_page + 1):
            browser.get(BASE_URL + str(page))
            items = browser.find_elements(
                By.CLASS_NAME, 'content__list--item--main')
            for item in items:
                try:
                    rows.append(parse_listing(item))
                except (IndexError, WebDriverException) as exc:
                    # Best-effort scrape: drop only the malformed listing
                    # rather than the rest of the page.
                    print('skipped a listing on page %d: %r' % (page, exc))
    finally:
        browser.quit()
    return rows


def save_listings(rows, path=OUTPUT_PATH):
    """Write *rows* to an Excel workbook and return the DataFrame.

    Args:
        rows: iterable of 7-tuples ordered like ``COLUMNS``.
        path: destination ``.xlsx`` file.

    Returns:
        The ``pandas.DataFrame`` that was written.
    """
    frame = pd.DataFrame(rows, columns=COLUMNS)
    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(path) as writer:
        frame.to_excel(writer, sheet_name=SHEET_NAME)
    return frame


if __name__ == "__main__":
    data = save_listings(scrape_listings())