Scraping Lianjia second-hand housing listings with Python

This was a final-term assignment a friend asked me to help with. I'm not much for long write-ups, so I'll just share the code; it runs as-is. It was written against Python 3.6, and the packages were installed from cmd with pip install <package name>. Some of the installs failed with a prompt saying pip needed upgrading; this link covers the fix:
https://blog.csdn.net/t35254056/article/details/86030552
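
For reference, the usual install commands are below (run from cmd; if an install fails with the pip-upgrade prompt, the first line normally fixes it). Note that lxml is needed because BeautifulSoup is constructed with the 'lxml' parser in the code:

python -m pip install --upgrade pip
pip install requests beautifulsoup4 lxml xlsxwriter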

Here is the code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import re

import requests
import xlsxwriter
from bs4 import BeautifulSoup

def generate_allurl(user_in_nub, user_in_city):  # Yield one listing-page URL per page
    url = 'https://' + user_in_city + '.lianjia.com/ershoufang/pg{}/'
    for url_next in range(1, int(user_in_nub) + 1):  # Lianjia numbers pages from pg1
        yield url.format(url_next)
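
# For example, generate_allurl('2', 'bj') yields
#   https://bj.lianjia.com/ershoufang/pg1/
#   https://bj.lianjia.com/ershoufang/pg2/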

def get_allurl(page_url, user_in_city):  # Parse one listing page and pull out every detail-page URL
    head = {'Host': user_in_city + '.lianjia.com',
            'Referer': 'https://' + user_in_city + '.lianjia.com/ershoufang/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
            }
    get_url = requests.get(page_url, headers=head, timeout=5)
    if get_url.status_code == 200:
        # each listing sits in <div class="info"> with its link inside the nested "title" div
        re_set = re.compile('<div.*?class="info.*?<div.*?class="title".*?<a.*?href="(.*?)"')
        return re.findall(re_set, get_url.text)
    print('Failed to fetch the detail URLs')
    return []  # empty list so the main loop can simply skip a failed page
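
# The same link extraction without the regex, as a sketch; this assumes the
# listing page keeps its div.info / div.title markup, which I have not
# verified against every city site:
#   soup = BeautifulSoup(get_url.text, 'lxml')
#   links = [a['href'] for a in soup.select('.info .title a')]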

def open_url(re_get, user_in_city):  # Fetch one detail page and extract the fields we want
    head = {'Host': user_in_city + '.lianjia.com',
            'Referer': 'https://' + user_in_city + '.lianjia.com/ershoufang/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
            }
    res = requests.get(re_get, headers=head, timeout=5)
    if res.status_code == 200:
        info = {}
        soup = BeautifulSoup(res.text, 'lxml')
        try:
            info['标题'] = soup.select('.main')[0].text
            info['总价'] = soup.select('.total')[0].text
            info['单位'] = '万'
            info['每平方售价'] = soup.select('.unitPriceValue')[0].text
            info['参考总价'] = soup.select('.taxtext')[0].text
            info['建造时间'] = soup.select('.subInfo')[2].text
            info['小区名称'] = soup.select('.info')[0].text
            info['所在区域'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
            # the listing ID is the .html filename at the end of the URL
            info['链家编号'] = re_get.rsplit('/', 1)[-1].split('.')[0]
        except IndexError:
            print('Unexpected page layout, skipping:', re_get)
            return None

        # the basic-info list: each item looks like <li><span class="label">key</span>value</li>
        for i in soup.select('.base li'):
            i = str(i)
            if '</span>' in i:
                key, value = i.split('</span>')
                info[key[24:]] = value.rsplit('</li>')[0]  # 24 chars skips '<li><span class="label">'
        return info
    print('Failed to fetch the listing details')
    return None

def writer_to_text(info):  # Alternative storage: append each record as a JSON line
    with open('链家二手房.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(info, ensure_ascii=False) + '\n')
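
# The output is one JSON object per line, so it reads back with, e.g.:
#   records = [json.loads(line) for line in open('链家二手房.txt', encoding='utf-8')]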


def pandas_to_xlsx(info, row, worksheet, bold_format):  # Write one record to the xlsx (via xlsxwriter, despite the name)
    columns = ['标题', '总价', '单位', '每平方售价', '参考总价', '建造时间', '小区名称', '所在区域',
               '链家编号', '房屋户型', '所在楼层', '建筑面积', '套内面积', '房屋朝向', '产权年限']
    if row == 1:  # row 0 is the bold header; data rows are passed in starting from 1
        for col, name in enumerate(columns):
            worksheet.write(0, col, name, bold_format)
    for col, name in enumerate(columns):
        worksheet.write_string(row, col, info.get(name, ''))  # blank cell if a field failed to parse
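
# To sanity-check the finished spreadsheet (assumes pandas plus an xlsx engine
# such as openpyxl, which the scraper itself does not need):
#   import pandas as pd
#   print(pd.read_excel('链家二手房.xlsx').head())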

# Alternative: store each record to the text file instead of the spreadsheet
# writer_to_text(open_url(url, user_in_city))

if __name__ == '__main__':
    user_in_city = input('City to scrape (Lianjia subdomain, e.g. bj): ')
    user_in_nub = input('Number of pages to scrape: ')

    # Set up the Excel output; bold_format marks the header row
    workbook = xlsxwriter.Workbook('./链家二手房.xlsx')
    worksheet = workbook.add_worksheet()
    bold_format = workbook.add_format({'bold': True})
    # worksheet.set_column(1, 1, 15)  # optionally widen column B to 15 (0-indexed)

    page = 1
    row = 1  # data rows start below the header
    for i in generate_allurl(user_in_nub, user_in_city):
        print('Fetching page', page, '...\n')
        for j in get_allurl(i, user_in_city):
            info = open_url(j, user_in_city)
            if info:  # skip listings that failed to download or parse
                pandas_to_xlsx(info, row, worksheet, bold_format)
                row += 1
        print('\nPage', page, 'done')
        page += 1
    workbook.close()
    print('Success')
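
One caveat from running scrapers like this: Lianjia starts serving a verification page if it sees rapid back-to-back requests. Below is a minimal politeness wrapper you could swap in for the bare requests.get calls in get_allurl and open_url; the 1-3 second delay and the retry count of 3 are my own guesses, not anything the site documents:

import time
import random

import requests

def polite_get(url, headers, retries=3, timeout=5):
    # Sleep 1-3 seconds before each attempt, retry a few times on failure,
    # and return None if every attempt fails so callers can skip the item.
    for attempt in range(retries):
        time.sleep(random.uniform(1, 3))
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            if res.status_code == 200:
                return res
        except requests.RequestException:
            pass  # network hiccup; fall through and retry
        print('Retry', attempt + 1, 'of', retries, 'for', url)
    return None

Callers then handle a None return the same way the existing failed-status branches already do.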
