python 58爬取商铺信息
个人接到一个 58 爬虫需求。58 的数据都直接渲染在 HTML 里面，暂时没有太严重的反爬措施。
技术交流:18611372505
封装获取 html方法
requests.packages.urllib3.disable_warnings()
# Browser-like User-Agent so 58.com serves the normal desktop listing page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    '''Fetch *url* and return the page parsed as an lxml etree object.

    Network errors and timeouts propagate as requests exceptions; the
    caller's retry loop handles them.
    '''
    # timeout added so a stalled connection cannot hang the scraper forever;
    # verify=False matches the urllib3 warnings disabled above.
    html_obj = requests.get(url, headers=headers, verify=False, timeout=15)
    html_obj = html_obj.content.decode()  # decode bytes ourselves to avoid mojibake
    tree = etree.HTML(html_obj)  # parse into an etree object for XPath queries
    return tree
使用 etree 解析html
def get_data(tree):
    """Extract one result page's shop listings from *tree*.

    Returns a tuple ``(data, next_page)`` where *data* is a DataFrame of
    the listings and *next_page* is the list of href(s) of the
    "next page" link, or ``False`` when this is the last page.
    """
    # Column name -> list of cell values, later fed to pd.DataFrame.
    info_dicts = defaultdict(list)
    div_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for div in div_list:
        # Shop title
        title = div.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # Administrative district
        descript = div.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(descript)
        # Street address
        mr5 = div.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(mr5)
        # Floor area (square meters)
        area = div.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(''.join(area))
        # Monthly rent; an absent price means "negotiable" (面议).
        # Plain value-expression conditionals replace the original
        # side-effecting `x.append(a) if c else x.append(b)` anti-pattern.
        price_monthly = div.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # Daily rent; same "negotiable" fallback.
        price_daily = div.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')
    # Renamed from `next` to avoid shadowing the builtin.
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, (next_page if next_page else False)
循环访问各页，将结果追加到 pandas.DataFrame，然后导出为 Excel 文件
# -------- main program --------
data = pd.DataFrame()
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("开始抓取链接: %s " % url)
    try:
        tree = get_html(url)
    except Exception as e:
        # Most failures here are the captcha/verification page; log the
        # actual error too so other causes are not silently mislabeled.
        print("需要验证码了:", e)
        time.sleep(20)
        continue
    an_data, next_url = get_data(tree)  # renamed from `next` (builtin shadowing)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([data, an_data], ignore_index=True)
    if next_url:
        url = next_url[0]
    else:
        break  # no "next page" link: last page reached
    # Random delay between pages to reduce the chance of anti-bot checks.
    time.sleep(random.randint(15, 27))
# Persist results; .xlsx because modern pandas no longer writes legacy .xls.
data.to_excel('58爬虫.xlsx', index=False)
完整代码
import random
from collections import defaultdict
import requests
import pandas as pd
from lxml import etree
import re
import time
requests.packages.urllib3.disable_warnings()
# Browser-like User-Agent so 58.com serves the normal desktop listing page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def get_html(url):
    '''Fetch *url* and return the page parsed as an lxml etree object.

    Network errors and timeouts propagate as requests exceptions; the
    caller's retry loop handles them.
    '''
    # timeout added so a stalled connection cannot hang the scraper forever;
    # verify=False matches the urllib3 warnings disabled above.
    html_obj = requests.get(url, headers=headers, verify=False, timeout=15)
    html_obj = html_obj.content.decode()  # decode bytes ourselves to avoid mojibake
    tree = etree.HTML(html_obj)  # parse into an etree object for XPath queries
    return tree
def get_data(tree):
    """Extract one result page's shop listings from *tree*.

    Returns a tuple ``(data, next_page)`` where *data* is a DataFrame of
    the listings and *next_page* is the list of href(s) of the
    "next page" link, or ``False`` when this is the last page.
    """
    # Column name -> list of cell values, later fed to pd.DataFrame.
    info_dicts = defaultdict(list)
    div_list = tree.xpath('//ul[@class="list-main-style"]/li')
    for div in div_list:
        # Shop title
        title = div.xpath('.//span[@class="title_des"]/text()')[0]
        info_dicts['名称'].append(title)
        # Administrative district
        descript = div.xpath('.//div[@class="list-info "]/p[1]/span[1]/text()')[0]
        info_dicts['行政区'].append(descript)
        # Street address
        mr5 = div.xpath('.//div[@class="list-info "]/p[1]/span[2]/text()')[0]
        info_dicts['位置'].append(mr5)
        # Floor area (square meters)
        area = div.xpath('.//div[@class="area"]/p/span[1]/text()')[0]
        info_dicts['面积(平方)'].append(''.join(area))
        # Monthly rent; an absent price means "negotiable" (面议).
        # Plain value-expression conditionals replace the original
        # side-effecting `x.append(a) if c else x.append(b)` anti-pattern.
        price_monthly = div.xpath('.//div[@class="price "]/p[1]/span/text()')
        info_dicts['租金'].append(''.join(price_monthly) if price_monthly else '面议')
        # Daily rent; same "negotiable" fallback.
        price_daily = div.xpath('.//div[@class="price "]/p[2]//text()')
        info_dicts['日租金'].append(price_daily[0].strip() if price_daily else '面议')
    # Renamed from `next` to avoid shadowing the builtin.
    next_page = tree.xpath('//a[@class="next"]/@href')
    data = pd.DataFrame(info_dicts)
    return data, (next_page if next_page else False)
# -------- main program --------
data = pd.DataFrame()
url = 'https://bj.58.com/shangpucz/?from=zf&gposLastIndex=139&PGT'
while True:
    print("开始抓取链接: %s " % url)
    try:
        tree = get_html(url)
    except Exception as e:
        # Most failures here are the captcha/verification page; log the
        # actual error too so other causes are not silently mislabeled.
        print("需要验证码了:", e)
        time.sleep(20)
        continue
    an_data, next_url = get_data(tree)  # renamed from `next` (builtin shadowing)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([data, an_data], ignore_index=True)
    if next_url:
        url = next_url[0]
    else:
        break  # no "next page" link: last page reached
    # Random delay between pages to reduce the chance of anti-bot checks.
    time.sleep(random.randint(15, 27))
# Persist results; .xlsx because modern pandas no longer writes legacy .xls.
data.to_excel('58爬虫.xlsx', index=False)