# coding=utf-8
# Scrape second-hand housing listings from Lianjia (lianjia.com) using bs4
import codecs
import requests
from bs4 import BeautifulSoup
import os
class HouseInfo:
    """Plain data holder for one second-hand house listing."""

    def __init__(self, imageUrl, title, subTitle, totalPrice, unitPrice):
        self.imageUrl = imageUrl    # remote cover-image URL
        self.title = title          # listing title
        self.subTitle = subTitle    # community name + layout details
        self.totalPrice = totalPrice
        self.unitPrice = unitPrice
        self.localImage = ""        # filled in after the image is downloaded

    def __str__(self):
        # BUG FIX: the original '+'-concatenation had no separator before
        # 'localImage:', gluing it onto the image URL; it also raised
        # TypeError whenever a field was None. .format() handles both.
        return ('title:{}\n subTitle:{}\n totalPrice:{}\n unitPrice:{}\n'
                ' imageUrl:{}\n localImage:{}').format(
            self.title, self.subTitle, self.totalPrice,
            self.unitPrice, self.imageUrl, self.localImage)
class Spider:
    """Crawls second-hand housing listing pages from bj.lianjia.com,
    saving each listing's text record and cover image to disk.

    Fixes vs. original: Python-2-only ``print x`` statements replaced with
    the ``print(x)`` form (valid on both 2 and 3); binary image files are
    written with plain ``open(..., 'wb')`` instead of ``codecs.open`` (codecs
    wrappers are for text encodings, not raw bytes).
    """

    def __init__(self):
        # -1 marks "no page fetched yet"; setCurrentPage() updates it.
        self.currentPage = -1
        # NOTE(review): output paths are hard-coded to a Windows d: drive.
        if not os.path.exists('d:/python/pachong/lianjia'):
            os.makedirs('d:/python/pachong/lianjia')

    def setCurrentPage(self, page):
        """Record the page being crawled and ensure its image dir exists."""
        self.currentPage = page
        path = 'd:/python/pachong/lianjia/{}'.format(page)
        if not os.path.exists(path):
            os.makedirs(path)

    def getHtmlData(self, page):
        """Fetch the listing HTML for *page* and hand it to the parser."""
        self.setCurrentPage(page)
        # A browser-like User-Agent avoids the site's basic bot filtering.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        url = 'https://bj.lianjia.com/ershoufang/pg{}/'.format(page)
        response = requests.get(url, headers=headers, allow_redirects=False)
        html = response.content.decode('utf8')
        self.getDataFromHtml(html)

    def getDataFromHtml(self, html):
        """Parse one listing page into HouseInfo objects and persist them."""
        soup = BeautifulSoup(html, 'html.parser')
        liList = soup.select(".sellListContent li")
        houseList = []
        for li in liList:
            # The real image URL is lazy-loaded via the data-original attribute.
            imageUrl = li.select("img")[0]['data-original']
            print(imageUrl)
            title = li.select(".title a")[0].string
            tag = li.select(".address .houseInfo")[0]
            # contents[1] is the community <a> tag, contents[2] the trailing text.
            subTitle = tag.contents[1].string + tag.contents[2]
            totalPrice = li.select(".priceInfo > .totalPrice > span")[0].string
            unitPrice = li.select(".priceInfo > .unitPrice > span")[0].string
            print(unitPrice)
            houseList.append(HouseInfo(imageUrl, title, subTitle, totalPrice, unitPrice))
        self.saveData(houseList)

    def saveData(self, houseList):
        """Append each house's text record to info.txt after downloading its image."""
        with codecs.open("d:/python/pachong/info.txt", "a+", encoding="utf8") as f:
            for house in houseList:
                house.localImage = self.saveImageData(house.imageUrl)
                f.write(str(house))
                f.write('\n-------------------------\n')
                f.flush()

    def beginSpider(self, beginPage, size):
        """Crawl *size* consecutive pages starting at *beginPage*."""
        for page in range(beginPage, beginPage + size):
            self.getHtmlData(page)

    def saveImageData(self, imageUrl):
        """Download one image into the current page's directory; return its path."""
        response = requests.get(imageUrl)
        name = imageUrl.split('/')[-1]
        path = 'd:/python/pachong/lianjia/{}/{}'.format(self.currentPage, name)
        # Raw bytes: use plain open() in 'wb' mode — codecs.open is for text.
        with open(path, 'wb') as f:
            f.write(response.content)
        return path
if __name__ == "__main__":
    # Entry point: crawl a single page of listings, starting at page 1.
    crawler = Spider()
    crawler.beginSpider(1, 1)
# (Pasted blog-page footer, not code — commented out so the file parses.)
# 使用bs4爬取链家网的二手房信息
# 最新推荐文章于 2024-04-12 14:15:06 发布