# coding=utf-8
# Scrape second-hand housing listings from Lianjia (lianjia.com) using bs4
import codecs
import requests
from bs4 import BeautifulSoup
import os
class HouseInfo:
    """Plain data holder for one second-hand house listing."""

    def __init__(self, imageUrl, title, subTitle, totalPrice, unitPrice):
        self.imageUrl = imageUrl    # remote cover-image URL
        self.title = title          # listing title
        self.subTitle = subTitle    # community name + layout details
        self.totalPrice = totalPrice
        self.unitPrice = unitPrice
        self.localImage = ""        # filled in after the image is downloaded

    def __str__(self):
        # BUG FIX: the original '+'-concatenation had no separator before
        # 'localImage:', gluing it onto the image URL; it also raised
        # TypeError whenever a field was None. .format() handles both.
        return ('title:{}\n subTitle:{}\n totalPrice:{}\n unitPrice:{}\n'
                ' imageUrl:{}\n localImage:{}').format(
            self.title, self.subTitle, self.totalPrice,
            self.unitPrice, self.imageUrl, self.localImage)
class Spider:
    """Crawls second-hand housing listing pages from bj.lianjia.com,
    saving each listing's text record and cover image to disk.

    Fixes vs. original: Python-2-only ``print x`` statements replaced with
    the ``print(x)`` form (valid on both 2 and 3); binary image files are
    written with plain ``open(..., 'wb')`` instead of ``codecs.open`` (codecs
    wrappers are for text encodings, not raw bytes).
    """

    def __init__(self):
        # -1 marks "no page fetched yet"; setCurrentPage() updates it.
        self.currentPage = -1
        # NOTE(review): output paths are hard-coded to a Windows d: drive.
        if not os.path.exists('d:/python/pachong/lianjia'):
            os.makedirs('d:/python/pachong/lianjia')

    def setCurrentPage(self, page):
        """Record the page being crawled and ensure its image dir exists."""
        self.currentPage = page
        path = 'd:/python/pachong/lianjia/{}'.format(page)
        if not os.path.exists(path):
            os.makedirs(path)

    def getHtmlData(self, page):
        """Fetch the listing HTML for *page* and hand it to the parser."""
        self.setCurrentPage(page)
        # A browser-like User-Agent avoids the site's basic bot filtering.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        url = 'https://bj.lianjia.com/ershoufang/pg{}/'.format(page)
        response = requests.get(url, headers=headers, allow_redirects=False)
        html = response.content.decode('utf8')
        self.getDataFromHtml(html)

    def getDataFromHtml(self, html):
        """Parse one listing page into HouseInfo objects and persist them."""
        soup = BeautifulSoup(html, 'html.parser')
        liList = soup.select(".sellListContent li")
        houseList = []
        for li in liList:
            # The real image URL is lazy-loaded via the data-original attribute.
            imageUrl = li.select("img")[0]['data-original']
            print(imageUrl)
            title = li.select(".title a")[0].string
            tag = li.select(".address .houseInfo")[0]
            # contents[1] is the community <a> tag, contents[2] the trailing text.
            subTitle = tag.contents[1].string + tag.contents[2]
            totalPrice = li.select(".priceInfo > .totalPrice > span")[0].string
            unitPrice = li.select(".priceInfo > .unitPrice > span")[0].string
            print(unitPrice)
            houseList.append(HouseInfo(imageUrl, title, subTitle, totalPrice, unitPrice))
        self.saveData(houseList)

    def saveData(self, houseList):
        """Append each house's text record to info.txt after downloading its image."""
        with codecs.open("d:/python/pachong/info.txt", "a+", encoding="utf8") as f:
            for house in houseList:
                house.localImage = self.saveImageData(house.imageUrl)
                f.write(str(house))
                f.write('\n-------------------------\n')
                f.flush()

    def beginSpider(self, beginPage, size):
        """Crawl *size* consecutive pages starting at *beginPage*."""
        for page in range(beginPage, beginPage + size):
            self.getHtmlData(page)

    def saveImageData(self, imageUrl):
        """Download one image into the current page's directory; return its path."""
        response = requests.get(imageUrl)
        name = imageUrl.split('/')[-1]
        path = 'd:/python/pachong/lianjia/{}/{}'.format(self.currentPage, name)
        # Raw bytes: use plain open() in 'wb' mode — codecs.open is for text.
        with open(path, 'wb') as f:
            f.write(response.content)
        return path
if __name__ == "__main__":
    # Entry point: crawl a single page of listings, starting at page 1.
    crawler = Spider()
    crawler.beginSpider(1, 1)
# (Pasted blog-page footer, not code — commented out so the file parses.)
# 使用bs4爬取链家网的二手房信息
# 最新推荐文章于 2024-04-12 14:15:06 发布