python 58爬取商铺信息
个人接到一个 58 爬虫需求。58 的数据都直接渲染在 HTML 里面，暂时没有太严重的反爬措施。
技术交流:18611372505
封装获取 html方法
requests.packages.urllib3.disable_warnings()
# Browser-like User-Agent so 58.com serves the normal desktop listing page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    '''Fetch *url* and return the page parsed as an lxml etree object.

    Network errors and timeouts propagate as requests exceptions; the
    caller's retry loop handles them.
    '''
    # timeout added so a stalled connection cannot hang the scraper forever;
    # verify=False matches the urllib3 warnings disabled above.
    html_obj = requests.get(url, headers=headers, verify=False, timeout=15)
    html_obj = html_obj.content.decode()  # decode bytes ourselves to avoid mojibake
    tree = etree.HTML(html_obj)  # parse into an etree object for XPath queries
    return tree
使用 etree 解析html
def get_data(tree):
    """Extract one result page's shop listings from *tree*.

    Returns a tuple ``(data, next_page)`` where *data* is a DataFrame of
    the listings and *next_page* is the list of href(s) of the
    "next page" link, or ``False`` when this is the last page.
    """
    # Column name -> list of cell values, later fed to pd.DataFrame.
    info_dicts = defaultdict(list)
    div_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for div in div_list:
        # Shop title
        title = div.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # Administrative district
        descript = div.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(descript)
        # Street address
        mr5 = div.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(mr5)
        # Floor area (square meters)
        area = div.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(''.join(area))
        # Monthly rent; an absent price means "negotiable" (面议).
        # Plain value-expression conditionals replace the original
        # side-effecting `x.append(a) if c else x.append(b)` anti-pattern.
        price_monthly = div.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # Daily rent; same "negotiable" fallback.
        price_daily = div.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')
    # Renamed from `next` to avoid shadowing the builtin.
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, (next_page if next_page else False)
循环访问各页，将结果追加到 pandas.DataFrame，然后导出为 Excel 文件
# -------- main program --------
data = pd.DataFrame()
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("开始抓取链接: %s " % url)
    try:
        tree = get_html(url)
    except Exception as e:
        # Most failures here are the captcha/verification page; log the
        # actual error too so other causes are not silently mislabeled.
        print("需要验证码了:", e)
        time.sleep(20)
        continue
    an_data, next_url = get_data(tree)  # renamed from `next` (builtin shadowing)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([data, an_data], ignore_index=True)
    if next_url:
        url = next_url[0]
    else:
        break  # no "next page" link: last page reached
    # Random delay between pages to reduce the chance of anti-bot checks.
    time.sleep(random.randint(15, 27))
# Persist results; .xlsx because modern pandas no longer writes legacy .xls.
data.to_excel('58爬虫.xlsx', index=False)
完整代码
import random
from collections import defaultdict
import requests
import pandas as pd
from lxml import etree
import re
import time
requests.packages.urllib3.disable_warnings()
# Browser-like User-Agent so 58.com serves the normal desktop listing page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    '''Fetch *url* and return the page parsed as an lxml etree object.

    Network errors and timeouts propagate as requests exceptions; the
    caller's retry loop handles them.
    '''
    # timeout added so a stalled connection cannot hang the scraper forever;
    # verify=False matches the urllib3 warnings disabled above.
    html_obj = requests.get(url, headers=headers, verify=False, timeout=15)
    html_obj = html_obj.content.decode()  # decode bytes ourselves to avoid mojibake
    tree = etree.HTML(html_obj)  # parse into an etree object for XPath queries
    return tree
def get_data(tree):
    """Extract one result page's shop listings from *tree*.

    Returns a tuple ``(data, next_page)`` where *data* is a DataFrame of
    the listings and *next_page* is the list of href(s) of the
    "next page" link, or ``False`` when this is the last page.
    """
    # Column name -> list of cell values, later fed to pd.DataFrame.
    info_dicts = defaultdict(list)
    div_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for div in div_list:
        # Shop title
        title = div.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # Administrative district
        descript = div.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(descript)
        # Street address
        mr5 = div.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(mr5)
        # Floor area (square meters)
        area = div.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(''.join(area))
        # Monthly rent; an absent price means "negotiable" (面议).
        # Plain value-expression conditionals replace the original
        # side-effecting `x.append(a) if c else x.append(b)` anti-pattern.
        price_monthly = div.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # Daily rent; same "negotiable" fallback.
        price_daily = div.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')
    # Renamed from `next` to avoid shadowing the builtin.
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, (next_page if next_page else False)
# -------- main program --------
data = pd.DataFrame()
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("开始抓取链接: %s " % url)
    try:
        tree = get_html(url)
    except Exception as e:
        # Most failures here are the captcha/verification page; log the
        # actual error too so other causes are not silently mislabeled.
        print("需要验证码了:", e)
        time.sleep(20)
        continue
    an_data, next_url = get_data(tree)  # renamed from `next` (builtin shadowing)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([data, an_data], ignore_index=True)
    if next_url:
        url = next_url[0]
    else:
        break  # no "next page" link: last page reached
    # Random delay between pages to reduce the chance of anti-bot checks.
    time.sleep(random.randint(15, 27))
# Persist results; .xlsx because modern pandas no longer writes legacy .xls.
data.to_excel('58爬虫.xlsx', index=False)