创建项目命令:
首先打开一个文件夹,在pycharm里面打开
scrapy startproject lianjiaspiders
使用命令 cd lianjiaspiders
在lianjiaspiders 里面创建一个项目:
scrapy genspider lianjia https://www.lianjia.com/city/
创建完成后显示:
写爬虫代码在spiders里面的lianjia.py里面完成
# -*- coding: utf-8 -*-
import scrapy
import re, requests
from lianjiaSpider.items import LianjiaspiderItem
from fake_useragent import UserAgent
class LianjiaSpider(scrapy.Spider):
name = 'lianjia'
# allowed_domains = ['bj.lianjia.com']
# 定义爬虫起始url
start_urls = ["https://www.lianjia.com/city/"]
def parse(self, response):
# 获取所有城市的url
city_url_list = response.xpath("//div[@class='city_list']//ul/li/a/@href").extract()
# 循环遍历拼接完整的url
for url in city_url_list:
city_url = url + "zufang"
yield scrapy.Request(url=city_url, callback=self.business_parse)
def business_parse(self, response):
# 获取商圈url列表
business_url_list = response.xpath("//ul[@data-target='area']/li[position()>1]/a/@href").extract()
# print(business_url_list)
for url in business_url_list:
business_url = "https://bj.lianjia.com" + url
# print(business_url)
yield scrapy.Request(url=business_url, callback=self.parse_page_url, meta={"data": business_url})
def parse_page_url(self, response):
# print(response.url)
# 缩小范围
div_list = response.xpath("//div[@class='content__list']/div")
for div in div_list:
# 图片
pic = div.xpath(".//img/@data-src").extract()[0]
pic = pic.replace("250x182", "2