# ===== File 1: items.py =====
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LianjiaItem(scrapy.Item):
    """Container for one rental listing scraped from lianjia.com.

    Fields are populated by the spider's parse callbacks; semantics of
    each field follow its name (e.g. ``detail_url`` is the listing's
    detail-page URL) — confirm against the spider that fills them.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pic = scrapy.Field()
    title = scrapy.Field()
    detail_url = scrapy.Field()
    price = scrapy.Field()
    publish_info = scrapy.Field()
    pic_list = scrapy.Field()
    house_code = scrapy.Field()
    ucid = scrapy.Field()
    agent_name = scrapy.Field()
    agent_phone = scrapy.Field()
# ===== File 2: lianjia.py =====
# -*- coding: utf-8 -*-
import scrapy
from LianJia.items import LianjiaItem
import re
import json
import requests
class LianjiaSpider(scrapy.Spider):
name = 'lianjia'
allowed_domains = ['lianjia.com']
start_urls = ['https://www.lianjia.com/city/']
def parse(self, response):
# 获取到的是新房的url
city_url_list = response.xpath("//div[@class='city_province']//li/a/@href").extract()
# print(city_url_list)
city_name_list = response.xpath("//div[@class='city_province']//li/a/text()").extract()
for index in range(len(city_url_list)):
city_name = city_name_list[index]
city_url = city_url_list[index]
# print(city_url)
# 城市首字母
city_alp = re.findall(r"https://(\w*).", city_url)[0]
# print(city_alp)
# 拼接租房城市url
city_url = "https://" + city_alp + ".lianjia.com/zufang/"
# print("--------------------{}开始下载-------------------------------".format(city_name))
yield scrapy.Request(url=city_url,