# ===== Spider (主要的程序 / main program) =====
import scrapy
from ..items import LianjiatestchenItem
import json
class LianjiachenSpider(scrapy.Spider):
    """Crawl Beijing rental listings from bj.lianjia.com/zufang.

    Flow: district list -> per-district page count -> listing pages ->
    listing detail page (floor info) -> agent phone-number API -> item.
    """

    name = 'Lianjiachen'
    # allowed_domains = ['bj.lianjia.com/zufang']
    start_urls = ['http://bj.lianjia.com/zufang/']

    def parse(self, response, **kwargs):
        """Collect the district (城区) links and schedule one request each."""
        area_bj_list = response.xpath(
            '//*[@id="filter"]/ul[2]//li[position()>1]//a/@href').extract()
        for href in area_bj_list:
            real_url = "https://bj.lianjia.com" + href
            yield scrapy.Request(url=real_url, callback=self.parse_page_url,
                                 dont_filter=True)

    def parse_page_url(self, response):
        """Read the district's total page count and schedule every page URL."""
        max_page = response.xpath(
            '//*[@id="content"]/div[1]/div[2]/@data-totalpage').extract_first()
        if not max_page:
            # Defensive: attribute missing (layout change or anti-bot page)
            # instead of IndexError on extract()[0].
            return
        for page in range(1, int(max_page) + 1):
            url = response.url + "pg" + str(page)
            yield scrapy.Request(url=url, callback=self.parse_message,
                                 dont_filter=True)

    def parse_message(self, response):
        """Parse every listing card on one result page into an item."""
        for house in response.xpath('//*[@id="content"]/div[1]/div[1]//div'):
            title = house.xpath(
                ".//p[@class='content__list--item--title']/a/text()").extract_first()
            if not title:
                # Not a listing card (ad / decorative div) — skip instead of
                # crashing with IndexError as the original did.
                continue
            # Title text is "name type orientation" separated by spaces.
            parts = title.strip().split(' ')
            if len(parts) < 3:
                continue  # title does not follow the expected 3-part shape
            item = LianjiatestchenItem()
            item['name'] = parts[0]            # building / estate name
            item['house_type'] = parts[1]      # layout (户型)
            item['orientation'] = parts[2]     # facing direction (朝向)
            des = ".//p[@class='content__list--item--des']"
            item['area'] = house.xpath(des + "/a/text()").extract_first()
            item['street'] = house.xpath(des + "/a[2]/text()").extract_first()
            item['concrete'] = house.xpath(des + "/a[3]/text()").extract_first()
            item['lease'] = house.xpath(".//span/em/text()").extract_first()
            # Feature tags joined with '-', or the original sentinel when empty.
            tags = house.xpath(".//p[3]//i/text()").extract()
            item['characteristic'] = '-'.join(tags) if tags else '空的'
            item['maintenance_time'] = house.xpath(
                ".//p[4]/span[2]/text()").extract_first()
            detail_href = house.xpath(
                ".//p[@class='content__list--item--title']/a/@href").extract_first()
            yield scrapy.Request(url='https://bj.lianjia.com' + detail_href,
                                 callback=self.detail_page, meta={'item': item})

    # Backward-compatible alias: the callback was originally misspelled.
    paese_message = parse_message

    def detail_page(self, response):
        """Extract floor info from the detail page, then query the phone API."""
        item = response.meta['item']
        loupan = response.xpath(
            '//*[@id="aside"]/ul/li[3]/span[2]/text()').extract_first()
        if loupan and ' ' in loupan:
            # Text looks like "楼层 中楼层/6层": keep the part after the space,
            # then split "floor category / total floors" on '/'.
            segs = loupan.split(' ')[1].split('/')
            if len(segs) >= 2:
                item['floor_properties'] = segs[0]
                item['floor_num'] = segs[1]
        data_agent = response.xpath(
            '//*[@id="aside"]/div[2]/div[1]/@data-agent').extract_first()
        if not data_agent:
            # No agent widget on this page: emit what we have instead of
            # crashing (original raised IndexError here).
            yield item
            return
        data_dict = json.loads(data_agent)
        digV = data_dict['digV']
        # Request body expected by the ex.lianjia.com phone400 endpoint.
        data = {
            "adId": str(json.loads(digV)['adId']),
            "digV": str(digV),
            "hdicCityId": "110000",
            "mediumId": "100000032",
            "mobileType": "AGENT",
            "required400": "true",
            "ucId": str(data_dict['ucId']),
        }
        yield scrapy.Request(url='https://ex.lianjia.com/sdk/phone400',
                             callback=self.phone_num, method='POST',
                             body=json.dumps(data),
                             headers={"Content-Type": "application/json"},
                             meta={'item': item})

    def phone_num(self, response):
        """Parse the phone400 API response and emit the finished item."""
        item = response.meta['item']
        payload = json.loads(response.text)
        records = payload.get('data') or []
        if records:
            item['phone_num'] = records[0].get('phone400')
        yield item
# ===== Pipeline (管道) =====
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class LianjiaSpiderPipeline:
    """Persist scraped Lianjia rental items into a local MySQL table."""

    def __init__(self):
        self.conn_mysql()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by process_item."""
        # NOTE(review): credentials are hard-coded; move them to settings.py
        # for anything beyond a local experiment.
        self.db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                  password='123456',
                                  database='test_chen_Lianjia', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Insert one item; roll back (and log) on failure.

        BUG FIX: the original read keys (pic, title, city_area, ...) that
        LianjiatestchenItem never defines, so every item raised KeyError.
        It now uses the fields the spider actually fills.
        NOTE(review): the target table's columns must match these field
        names — confirm the schema before deploying.
        """
        fields = ('name', 'house_type', 'orientation', 'street', 'area',
                  'concrete', 'lease', 'characteristic', 'maintenance_time',
                  'floor_properties', 'floor_num', 'phone_num')
        values = tuple(item.get(f) for f in fields)
        # Parameterized query: pymysql escapes each %s placeholder.
        sql = """insert into linajia_table
                 (name,house_type,orientation,street,area,concrete,lease,
                  characteristic,maintenance_time,floor_properties,floor_num,
                  phone_num)
                 values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        try:
            self.cursor.execute(sql, values)
            self.db.commit()
        except Exception as e:
            spider.logger.error("MySQL insert failed: %s", e)
            # Roll back so the failed statement does not poison the session.
            self.db.rollback()
        return item

    def close_spider(self, spider):
        """Release DB resources when the spider finishes (was never closed)."""
        self.cursor.close()
        self.db.close()
# ===== Items =====
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class LianjiatestchenItem(scrapy.Item):
    """Container for one rental listing scraped by LianjiachenSpider."""

    name = scrapy.Field()              # building / estate name
    house_type = scrapy.Field()        # layout (户型), e.g. "2室1厅1卫"
    orientation = scrapy.Field()       # facing direction (朝向)
    street = scrapy.Field()            # street within the district
    area = scrapy.Field()              # city district (区域)
    concrete = scrapy.Field()          # specific location / community
    lease = scrapy.Field()             # monthly rent text
    characteristic = scrapy.Field()    # feature tags joined with '-'
    maintenance_time = scrapy.Field()  # last-updated text from the listing
    floor_properties = scrapy.Field()  # floor category (from detail page)
    floor_num = scrapy.Field()         # total floors (from detail page)
    phone_num = scrapy.Field()         # agent 400 phone number
# (end of pasted files)