python爬取链家新房数据

没有搜索到合适的 Python 爬虫示例,所以自己动手写一个。

from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

# Request headers that mimic a desktop IE browser so Lianjia serves the
# normal HTML pages instead of blocking the script as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Accept': 'image/webp,image/*,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&eqid=c3435a7d00006bd600000003582bfd1f',
    'Connection': 'keep-alive'}
# Pagination URL segment ("pg1/", "pg2/", ...). NOTE(review): this module-level
# name is shadowed by a local of the same name inside areainfo().
page = ('pg')


def generate_cityurl(user_in_city):
    """Build the Lianjia new-home listing base URL for a city.

    Args:
        user_in_city: City abbreviation used as the Lianjia subdomain
            (e.g. "bj" for Beijing).

    Returns:
        str: Base listing URL such as "https://bj.lianjia.com/loupan/".
    """
    # The original version carried unreachable commented-out json/demjson
    # handling after the return statement; it has been removed.
    return 'https://' + user_in_city + '.lianjia.com/loupan/'


def areainfo(url):
    """Fetch listing pages 1-38 for a city and return the raw HTML.

    Args:
        url: Base listing URL, e.g. "https://bj.lianjia.com/loupan/".

    Returns:
        bytes: The concatenated response bodies of every fetched page.
    """
    # The original had two identical if/else branches (differing only in how
    # the accumulator was initialized) and slept once AFTER the whole loop,
    # which throttled nothing. The comment also claimed pages 1-100 while the
    # range fetched 1-38; the range is kept, the comment corrected.
    pages = []
    for page_no in range(1, 39):  # pages 1 through 38
        page_url = url + 'pg' + str(page_no) + '/'
        print(page_url)
        response = requests.get(url=page_url, headers=headers)
        pages.append(response.content)
        time.sleep(0.5)  # polite per-request delay to avoid hammering the site
    return b''.join(pages)


# Accumulated rows (one dict per property); filled by listinfo() and written
# to CSV by the __main__ block.
hlist = []


def listinfo(listhtml):
    """Parse concatenated listing-page HTML and append one dict per property
    to the module-level ``hlist``.

    Args:
        listhtml: Raw HTML (bytes or str) as returned by areainfo().
    """
    soup = BeautifulSoup(listhtml, 'html.parser')
    for block in soup.find_all('div', attrs={'class': 'resblock-desc-wrapper'}):
        title_div = block.find("div", attrs={"class": "resblock-name"})
        spans = title_div.find_all("span")
        price_div = block.find("div", attrs={"class": "resblock-price"})
        # The total-price node is absent for some listings; fall back to a
        # "not available" placeholder.
        total_div = price_div.find("div", attrs={"class": "second"})
        hlist.append({
            'title': title_div.a.get_text(),
            'wuye': spans[0].get_text(),
            'xiaoshouzhuangtai': spans[1].get_text(),
            'location': block.find("div", attrs={"class": "resblock-location"}).get_text().replace("\n", ""),
            'jishi': block.find("a", attrs={"class": "resblock-room"}).get_text().replace("\n", ""),
            'area': block.find("div", attrs={"class": "resblock-area"}).get_text(),
            'tag': block.find("div", attrs={"class": "resblock-tag"}).get_text(),
            'price': price_div.find("div", attrs={"class": "main-price"}).get_text(),
            'totalprice': total_div.get_text() if total_div is not None else "暂无",
        })


if __name__ == '__main__':
    user_in_city = input('输入抓取城市:')
    url = generate_cityurl(user_in_city)
    print(url)
    # Prepend a Chinese header row so the CSV is readable on its own
    # (the DataFrame columns below only fix the column ORDER).
    hlist.append(
        {'title': "楼盘名称", 'wuye': "物业类型", 'xiaoshouzhuangtai': "销售状态", 'location': "位置",
         'jishi': "房型", 'area': "面积", 'tag': "标签", 'price': "单价",
         'totalprice': "总价"})
    areahtml = areainfo(url)
    listinfo(areahtml)
    houseinfo = pd.DataFrame(hlist,
                             columns=['title', 'wuye', 'xiaoshouzhuangtai', 'location',
                                      'jishi', 'area', 'tag', 'price',
                                      'totalprice'])
    # Write next to the working directory. The original hard-coded an absolute
    # path to one user's desktop with mixed path separators
    # ('C:\\Users\\czw\\Desktop/链家新房.csv'), which fails on any other machine.
    houseinfo.to_csv('链家新房.csv', index=False, encoding="utf_8_sig")

  • 6
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 7
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

瞬间的未来式

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值