创建项目命令:
首先打开一个文件夹,在pycharm里面打开
scrapy startproject lianjiaspiders
使用命令 cd lianjiaspiders
在lianjiaspiders 里面创建一个项目:
scrapy genspider lianjia https://www.lianjia.com/city/
创建完成后显示:
写爬虫代码在 spiders 目录里面的 lianjia.py 里面完成
# -*- coding: utf-8 -*-
import scrapy
import re, requests
from lianjiaSpider.items import LianjiaspiderItem
from fake_useragent import UserAgent
class LianjiaSpider(scrapy.Spider):
    """Crawls Lianjia rental (zufang) listings across all cities.

    Flow: city list -> per-city rental section -> per-business-district
    listing pages -> per-listing detail page (floor + agent phone).
    """

    name = 'lianjia'
    # allowed_domains = ['bj.lianjia.com']
    # Crawl entry point: the page listing every Lianjia city.
    start_urls = ["https://www.lianjia.com/city/"]

    def parse(self, response):
        """Extract every city URL and request that city's rental section."""
        city_url_list = response.xpath("//div[@class='city_list']//ul/li/a/@href").extract()
        for url in city_url_list:
            # Each city homepage hosts its rentals under the /zufang path.
            city_url = url + "zufang"
            yield scrapy.Request(url=city_url, callback=self.business_parse)

    def business_parse(self, response):
        """Extract the business-district links of a city's rental section."""
        # position()>1 skips the first "no filter" entry of the area list.
        business_url_list = response.xpath(
            "//ul[@data-target='area']/li[position()>1]/a/@href").extract()
        for url in business_url_list:
            # urljoin resolves against the current city's domain; the old code
            # hard-coded bj.lianjia.com and therefore broke for every other city.
            business_url = response.urljoin(url)
            # Carry the district URL along so pagination URLs can be built later.
            yield scrapy.Request(url=business_url, callback=self.parse_page_url,
                                 meta={"data": business_url})

    def parse_page_url(self, response):
        """Parse one listing page: yield a detail-page request per listing,
        then requests for the remaining pages of the district."""
        div_list = response.xpath("//div[@class='content__list']/div")
        for div in div_list:
            # Picture: swap the thumbnail size for the large rendition.
            pic = div.xpath(".//img/@data-src").extract()[0]
            pic = pic.replace("250x182", "2500x1800")
            # Title
            title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0].strip()
            # City district
            city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()").extract()[0]
            # Business circle
            business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()").extract()[0]
            # Floor area (missing on some listings, default to "")
            area = div.xpath(".//p[@class='content__list--item--des']//text()[4]").extract()
            area = area[0].strip() if area else ""
            # Orientation
            toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]").extract()[0].strip()
            # Room layout string, e.g. "2室1厅1卫"
            fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]").extract()[0].strip()
            room = re.findall(r"(\d+)室", fang_info)
            hall = re.findall(r"(\d+)厅", fang_info)
            toilet = re.findall(r"(\d+)卫", fang_info)
            # Default to 0 when a component is absent from the layout string.
            room = int(room[0]) if room else 0
            hall = int(hall[0]) if hall else 0
            toilet = int(toilet[0]) if toilet else 0
            # Publish time
            publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()").extract()[0]
            # Tags, joined into one "-"-separated string
            sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()").extract()
            sign = "-".join(sign_list)
            # Price
            price = div.xpath(".//em/text()").extract()[0]
            # Detail page URL, resolved against the current city's domain.
            detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href").extract()[0]
            detail_url = response.urljoin(detail_url)
            # Fill the item with everything available on the listing page.
            item = LianjiaspiderItem()
            item["room"] = room
            item["hall"] = hall
            item["toliet"] = toilet  # the item field keeps its historical spelling
            item["pic"] = pic
            item["title"] = title
            item["city_area"] = city_area
            item["business_circle"] = business_circle
            item["area"] = area
            item["toward"] = toward
            item["sign"] = sign
            item["price"] = price
            item["publish_date"] = publish_date
            item["detail_url"] = detail_url
            # Fetch the detail page for the remaining fields (floor, phone).
            yield scrapy.Request(url=detail_url, meta={"data": item},
                                 callback=self.parse_detail, dont_filter=True)
        # Pagination: pg1 is the page just parsed, so request pages
        # 2..max_page and parse them with this same method.  (The old code
        # called self.parse -- which expects the city-list page -- requested
        # pg1 a second time, and never requested the last page.)
        max_page = response.xpath("//div[@class='content__pg']/@data-totalpage").extract()
        max_page = int(max_page[0]) if max_page else 1
        business_url = response.meta["data"]
        for page in range(2, max_page + 1):
            page_url = business_url + "pg{}/".format(page)
            yield scrapy.Request(url=page_url, callback=self.parse_page_url,
                                 meta={"data": business_url})

    def parse_detail(self, response):
        """Parse a listing's detail page for the floor and the agent phone."""
        item = response.meta["data"]
        # Floor description (8th entry of the facts list; may be absent).
        floor = response.xpath("//ul/li[@class='fl oneline'][8]/text()").extract()
        floor = floor[0] if floor else ""
        # The phone number sits behind an AJAX endpoint keyed by the agent's
        # ucid and the listing's house_code.
        ucid = self.get_ucid(response)
        # Raw string and escaped dot: the old "zufang/(.*?).html" let the
        # unescaped dot match any character.
        house_code = re.findall(r"zufang/(.*?)\.html", response.url)[0]
        agent_url = "https://bj.lianjia.com/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(
            house_code, ucid)
        try:
            headers = {"User-Agent": UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
        except Exception as e:
            # Best effort: a missing or blocked endpoint leaves phone empty.
            print(e)
            phone = ''
        item["phone"] = phone
        item["floor"] = floor
        yield item

    def get_ucid(self, response, _attempt=1):
        """Return the agent ucid from the detail page, retrying up to 3 times.

        The original kept its retry counter in a local reset to 1 on every
        recursive call, so the retry limit was unreachable; the counter is
        now threaded through as a parameter.
        """
        try:
            ucid = response.xpath("//span[@class='contact__im im__online']/@data-info").extract()
            return ucid[0] if ucid else ""
        except Exception as e:
            print(e)
            if _attempt >= 3:
                return ""
            return self.get_ucid(response, _attempt + 1)
items.py 里面的代码:这里相当于字段定义,爬虫和管道中使用的字段名必须和这里定义的一致
import scrapy
class LianjiaspiderItem(scrapy.Item):
    # define the fields for your item here like:
    room = scrapy.Field()  # number of bedrooms
    hall = scrapy.Field()  # number of living rooms
    toliet = scrapy.Field()  # number of bathrooms (field name keeps its historical misspelling; spider and pipeline both use "toliet")
    pic = scrapy.Field()  # picture URL
    title = scrapy.Field()  # listing title
    city_area = scrapy.Field()  # city district
    business_circle = scrapy.Field()  # business circle
    area = scrapy.Field()  # floor area
    toward = scrapy.Field()  # orientation
    sign = scrapy.Field()  # tags joined with "-"
    price = scrapy.Field()  # rental price
    detail_url = scrapy.Field()  # detail-page URL
    publish_date = scrapy.Field()  # publish time
    floor = scrapy.Field()  # floor description
    phone = scrapy.Field()  # agent phone number
配置里面需要完成的:
pipelines管道里面需要完成的代码。包括写入数据库:
import pymysql
class LianjiaspiderPipeline(object):
    """Writes scraped listings into the MySQL table ``lianjia_copy``."""

    def __init__(self):
        # Running row counter, used only for console progress output.
        self.count = 1
        self.conn_mysql()

    def conn_mysql(self):
        """Open the MySQL connection and cursor used by process_item."""
        self.conn = pymysql.connect(host="127.0.0.1", user="root", password='123',
                                    database="0218", charset="utf8")
        # Cursor object used to run the INSERT statements.
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Build a parameterized INSERT for one listing and return the item.

        Parameterized %s placeholders replace the old str.format interpolation,
        which broke on any quote character in scraped text and was vulnerable
        to SQL injection.
        """
        sql = ("insert into lianjia_copy(room,hall,toliet,pic,title,city_area,"
               "business_circle,area,toward,publish_date,sign,price,detail_url,"
               "floor,phone) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        params = (item["room"], item["hall"], item["toliet"], item["pic"],
                  item["title"], item["city_area"], item["business_circle"],
                  item["area"], item["toward"], item["publish_date"],
                  item["sign"], item["price"], item["detail_url"],
                  item["floor"], item["phone"])
        # Kept commented, as in the original, while the pipeline is in
        # dry-run mode; uncomment to actually persist rows.
        # self.cur.execute(sql, params)
        # self.conn.commit()
        print(self.count, sql, params)
        self.count += 1
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes
        (the original leaked both)."""
        self.cur.close()
        self.conn.close()
运行的命令在 main.py 里面
from scrapy import cmdline

# Run the spider defined above: it is named "lianjia" (see LianjiaSpider.name);
# the original crawled "sina", a leftover from another project, which would
# fail with "Spider not found".
# cmdline.execute("scrapy crawl lianjia".split())
cmdline.execute("scrapy crawl lianjia --nolog".split())