# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from lianjiawang.items import LianjiawangItem
class LianjiaSpider(scrapy.Spider):
    """Scrape second-hand housing listings from Lianjia (Gulou district, Nanjing).

    Parses the listing page with BeautifulSoup (matching the original
    implementation) and yields one ``LianjiawangItem`` per house entry.
    """
    name = 'lianjia'
    start_urls = ['https://nj.lianjia.com/ershoufang/gulou/']

    def parse(self, response):
        """Parse one listing page.

        Yields:
            LianjiawangItem: one item per ``<li>`` listing entry.
        """
        bs = BeautifulSoup(response.body, 'html.parser')
        listing = bs.find('ul', {"class": "sellListContent"})
        if listing is None:
            # Page layout changed or anti-scraping page served; nothing to yield.
            return
        # Use find_all('li') rather than .children: .children also yields
        # NavigableString whitespace nodes, which crash on .find(...).
        for tag in listing.find_all('li', recursive=False):
            title_div = tag.find('div', {"class": "title"})
            if title_div is None:
                # Some <li> entries are ads/placeholders without listing data.
                continue
            # A fresh item per listing — reusing one instance would make every
            # buffered item alias the same mutated object.
            homeitem = LianjiawangItem()
            # Listing headline.
            title = title_div.get_text().strip()
            # Community (xiaoqu) name: the <a> inside houseInfo.
            house_info = tag.find('div', {"class": "address"}).find('div', {"class": "houseInfo"})
            community_name = house_info.find('a').get_text().strip()
            # Layout / area / orientation / decoration are '|'-separated in the
            # houseInfo text; index 0 is the community name already extracted.
            # ('parts', not 'str' — do not shadow the builtin.)
            parts = house_info.get_text().split('|')
            house_type = parts[1].strip()
            area = parts[2].strip()
            face = parts[3].strip()
            decoration = parts[4].strip()
            # Floor, building height and construction year come from fixed
            # character slices of the positionInfo text — assumes Lianjia's
            # exact text layout; fragile if the site wording changes.
            position_div = tag.find('div', {"class": "flood"}).find('div', {"class": "positionInfo"})
            info = position_div.get_text()
            louceng = info[0:3].strip()
            heigh = info[4:7].strip()
            cons_time = info[8:13].strip()
            # District/location link text.
            position = position_div.find('a').get_text().strip()
            # Follower count and publish time, '/'-separated.
            follow = tag.find('div', {"class": "followInfo"}).get_text().strip().split('/')
            focus_num = follow[0].strip()
            pu_time = follow[1].strip()
            # Total price and unit price; unit price is stripped of its
            # currency wording ("单价", "元/平米").
            price_div = tag.find('div', {"class": "priceInfo"})
            total_price = price_div.find('div', {"class": "totalPrice"}).find('span').get_text().strip()
            unit_price = price_div.find('div', {"class": "unitPrice"}).find('span').get_text()
            unit_price = unit_price.replace("单价", "").replace("元/平米", "").strip()

            homeitem["title"] = title
            homeitem["community_name"] = community_name
            homeitem["house_type"] = house_type
            homeitem["area"] = area
            homeitem["face"] = face
            homeitem["decoration"] = decoration
            homeitem["louceng"] = louceng
            homeitem["heigh"] = heigh
            homeitem["cons_time"] = cons_time
            homeitem["position"] = position
            homeitem["focus_num"] = focus_num
            homeitem["pu_time"] = pu_time
            homeitem["total_price"] = total_price
            homeitem["unit_price"] = unit_price
            yield homeitem
在导入 item 类时遇到一个问题：`from lianjiawang.items import LianjiawangItem` 这句代码会报错。
解决办法：按下面的方式编写 items.py，在其中显式地 `from scrapy.item import Item, Field`，并保证各字段已在 `LianjiawangItem` 中定义。
items.py 完整代码如下：
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field
class LianjiawangItem(scrapy.Item):
    """Container for one second-hand listing scraped by the Lianjia spider.

    Each field is a plain ``scrapy.Field``; the spider's ``parse`` method
    fills every one of them per listing.
    """
    title = scrapy.Field()           # listing headline
    community_name = scrapy.Field()  # community (xiaoqu) name
    house_type = scrapy.Field()      # layout (from houseInfo, '|'-separated)
    area = scrapy.Field()            # floor area text
    face = scrapy.Field()            # orientation
    decoration = scrapy.Field()      # decoration state
    louceng = scrapy.Field()         # floor
    heigh = scrapy.Field()           # building height text
    cons_time = scrapy.Field()       # construction year text
    position = scrapy.Field()        # district/location
    focus_num = scrapy.Field()       # follower count text
    pu_time = scrapy.Field()         # publish time text
    total_price = scrapy.Field()     # total price
    unit_price = scrapy.Field()      # price per square metre (units stripped)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import csv
class LianjiawangPipeline(object):
    """Append each scraped listing as one row of ``lianjia_1.csv``.

    The file is opened per item in append mode, so rows accumulate across
    runs; no header row is written.
    """

    def process_item(self, item, spider):
        """Write one item to the CSV file and pass it through unchanged.

        Args:
            item: mapping with the fourteen listing fields set by the spider.
            spider: the spider instance (unused).

        Returns:
            The same item, so downstream pipelines keep receiving it.
        """
        data = [
            item["title"],
            item["community_name"],
            item["total_price"],
            item["unit_price"],
            item["house_type"],
            item["area"],
            item["face"],
            item["decoration"],
            item["louceng"],
            item["heigh"],
            item["position"],
            item["cons_time"],
            item["focus_num"],
            item["pu_time"],
        ]
        # encoding="utf-8" is required: the fields contain Chinese text, and
        # relying on the platform default encoding corrupts output (or raises)
        # on non-UTF-8 locales. newline="" is the csv-module requirement.
        with open("lianjia_1.csv", "a", newline="", encoding="utf-8") as fi:
            writer = csv.writer(fi, dialect='excel')
            writer.writerow(data)
        return item
附完整工程代码链接
链接: https://pan.baidu.com/s/1DZbTpWboxXDuFgDhG6qTzQ 提取码: g612