Scraping 58.com shop listings with Python

I recently took on a small 58.com scraping job. The listing data is all embedded directly in the page HTML, and for now the site has no serious anti-scraping measures.


Wrap the HTML fetch in a helper

import requests
from lxml import etree

requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    """Fetch a page and return it as an lxml etree object."""
    response = requests.get(url, headers=headers, verify=False)
    text = response.content.decode()  # decode the raw bytes ourselves to avoid mojibake
    return etree.HTML(text)
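If the crawl needs to survive occasional throttling, the same helper can be built on a requests.Session with automatic retries. This is a minimal sketch, assuming urllib3's standard Retry policy is acceptable; get_html_with_retries and its parameter values are my own naming, not anything the site requires:

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(headers)
# retry up to 3 times with exponential backoff on common throttling/server errors
retry = Retry(total=3, backoff_factor=2, status_forcelist=[429, 500, 502, 503])
session.mount('https://', HTTPAdapter(max_retries=retry))


def get_html_with_retries(url):
    """Like get_html, but reuses one connection pool and retries transient failures."""
    response = session.get(url, verify=False, timeout=15)
    return etree.HTML(response.content.decode())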

Parse the HTML with etree

def get_data(tree):
    """Extract one page of shop listings."""
    # column name -> list of values, one entry per listing
    info_dicts = defaultdict(list)
    li_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for li in li_list:
        # listing title
        title = li.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # district (the class attribute "list-info " really does end with a space)
        district = li.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(district)
        # detailed address
        address = li.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(address)
        # floor area
        area = li.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(area)
        # monthly rent; an empty result means "negotiable" (面议)
        price_monthly = li.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # daily rent
        price_daily = li.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')

    # href of the "next page" link, if any
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, next_page if next_page else False
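Before launching the full crawl, it is worth smoke-testing the XPath logic against a hand-written fragment that mimics the live page's structure. The fragment and its values below are made up for illustration:

sample = '''
<ul class="list-main-style">
  <li>
    <span class="title_des">临街旺铺</span>
    <div class="list-info "><p><span>朝阳</span><span>望京</span></p></div>
    <div class="area"><p><span>120</span></p></div>
    <div class="price "><p><span>15000元/月</span></p><p>500元/天</p></div>
  </li>
</ul>
'''
df, next_page = get_data(etree.HTML(sample))
print(df)          # one row with all six columns filled in
print(next_page)   # False: the fragment has no <a class="next"> link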

Loop through the pages, accumulate the results into a pandas DataFrame, then export to an Excel file

# main program
frames = []
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("Fetching: %s" % url)
    try:
        tree = get_html(url)
    except Exception as e:
        print("Request failed (possibly a captcha challenge): %s" % e)
        time.sleep(20)
        continue
    page_data, next_page = get_data(tree)
    frames.append(page_data)
    if next_page:
        url = next_page[0]
    else:
        break
    time.sleep(random.randint(15, 27))  # random delay to look less like a bot

data = pd.concat(frames, ignore_index=True)

# persist to disk (.xlsx, since recent pandas no longer writes legacy .xls)
data.to_excel('58爬虫.xlsx', index=False)
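Collecting the per-page frames in a list and concatenating once at the end avoids re-copying the growing frame on every iteration, and it works on pandas 2.x, where DataFrame.append no longer exists. Since the random delays make a full crawl slow, it can also be worth checkpointing each page to disk as it arrives. A minimal sketch, with a checkpoint file name of my own choosing:

import os

def checkpoint(page_data, path='58_checkpoint.csv'):
    """Append one page of results to a CSV, writing the header only once."""
    page_data.to_csv(path, mode='a', header=not os.path.exists(path),
                     index=False, encoding='utf-8-sig')  # BOM so Excel renders the Chinese columns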

Full code

import random
from collections import defaultdict
import requests
import pandas as pd
from lxml import etree
import time

requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning caused by verify=False
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    """Fetch a page and return it as an lxml etree object."""
    response = requests.get(url, headers=headers, verify=False)
    text = response.content.decode()  # decode the raw bytes ourselves to avoid mojibake
    return etree.HTML(text)


def get_data(tree):
    """Extract one page of shop listings."""
    # column name -> list of values, one entry per listing
    info_dicts = defaultdict(list)
    li_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for li in li_list:
        # listing title
        title = li.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # district (the class attribute "list-info " really does end with a space)
        district = li.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(district)
        # detailed address
        address = li.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(address)
        # floor area
        area = li.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(area)
        # monthly rent; an empty result means "negotiable" (面议)
        price_monthly = li.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # daily rent
        price_daily = li.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')

    # href of the "next page" link, if any
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, next_page if next_page else False




# main program
frames = []
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("Fetching: %s" % url)
    try:
        tree = get_html(url)
    except Exception as e:
        print("Request failed (possibly a captcha challenge): %s" % e)
        time.sleep(20)
        continue
    page_data, next_page = get_data(tree)
    frames.append(page_data)
    if next_page:
        url = next_page[0]
    else:
        break
    time.sleep(random.randint(15, 27))  # random delay to look less like a bot

data = pd.concat(frames, ignore_index=True)

# persist to disk (.xlsx, since recent pandas no longer writes legacy .xls)
data.to_excel('58爬虫.xlsx', index=False)

Output

(screenshot of the exported spreadsheet)
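A quick way to sanity-check the export without opening Excel is to read it back with pandas:

print(pd.read_excel('58爬虫.xlsx').head())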
