# - 系统分析网页性质
# - 结构化的数据解析
# - csv数据保存
# - python 3.8
# - pycharm 专业版 >>> 激活码
# - 模块使用:
# - requests >>> pip install requests
# - parsel >>> pip install parsel
# 导入模块
import requests # 数据请求模块 第三方模块 pip install requests
import parsel # 数据解析模块
import re
import csv
# 发送请求, 对房源列表发出请求
# Fetch the first page of the second-hand housing listing.
url = 'https://bj.lianjia.com/ershoufang/pg1/'
# A request header must be sent along: it disguises the Python client as a
# browser when talking to the server.
# User-Agent carries the browser's basic identification.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# requests has no default timeout; without one a stalled connection would
# block the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
# Retrieve the data: dump the raw HTML of the listing page.
print(response.text)
# 解析数据
# Turn the listing page's HTML (response.text) into a Selector object so it
# can be queried with CSS expressions, then collect every detail-page link.
selector_1 = parsel.Selector(response.text)
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# Extracts the total floor count from text such as '共5层' -> '5'.
# Written as a raw string: '\d' inside a normal string literal is an invalid
# escape sequence.
floor_pattern = re.compile(r'共(\d+)层')

# Save the data.
# The writer must exist before the first row is written, so the file is
# opened ahead of the loop; the ``with`` block closes it even if a page fails.
with open('二手房数据.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '标题',
        '市区',
        '小区',
        '户型',
        '朝向',
        '楼层',
        '装修情况',
        '电梯',
        '面积(㎡)',
        '价格(万元)',
        '年份',
    ])
    # The file is opened in append mode, which starts at end-of-file: a
    # position of 0 means the file is new or empty. Writing the header only
    # then keeps repeated runs from inserting duplicate header rows.
    if f.tell() == 0:
        csv_writer.writeheader()

    for link in href:
        # Request each detail page; a network failure or HTTP error on one
        # listing is reported and skipped rather than aborting the whole run.
        try:
            detail_response = requests.get(url=link, headers=headers, timeout=10)
            detail_response.raise_for_status()
        except requests.RequestException as exc:
            print(f'skip {link}: {exc}')
            continue
        html_data = detail_response.text
        selector = parsel.Selector(html_data)

        # CSS selector syntax. ``.get()`` returns None when nothing matches,
        # so every value that is post-processed falls back to '' first.
        title = selector.css('.title h1::text').get()  # title
        area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
        community_name = selector.css('.communityName .info::text').get()  # community
        room = selector.css('.room .mainInfo::text').get()  # layout
        room_type = selector.css('.type .mainInfo::text').get()  # orientation

        # Floor text looks like '中楼层/共5层': split on '/' gives
        # ['中楼层', '共5层'], [-1] takes the last element '共5层', and the
        # regex then extracts the number '5'.
        height = (selector.css('.room .subInfo::text').get() or '').split('/')[-1]
        floor_match = floor_pattern.search(height)
        height = floor_match.group(1) if floor_match else ''  # total floors

        sub_info = (selector.css('.type .subInfo::text').get() or '').split('/')[-1]  # decoration
        # NOTE: stored exactly as scraped. The original author left a disabled
        # normalisation of '暂无数据电梯' / None to '无电梯' here.
        Elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
        house_area = (selector.css('.content li:nth-child(3)::text').get() or '').replace('㎡', '')  # floor area
        price = selector.css('.price .total::text').get()  # price (10k yuan)
        date = (selector.css('.area .subInfo::text').get() or '').replace('年建', '')  # year built

        dit = {
            '标题': title,
            '市区': area,
            '小区': community_name,
            '户型': room,
            '朝向': room_type,
            '楼层': height,
            '装修情况': sub_info,
            '电梯': Elevator,
            '面积(㎡)': house_area,
            '价格(万元)': price,
            '年份': date,
        }
        csv_writer.writerow(dit)
        print(title, area, community_name, room, room_type, height, sub_info, Elevator, house_area, price, date,
              sep='|')