第一个Scrapy爬虫程序

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from lianjiawang.items import LianjiawangItem
class LianjiaSpider(scrapy.Spider):
    """Scrape second-hand housing listings from Lianjia (Gulou district, Nanjing)."""

    name = 'lianjia'

    start_urls = ['https://nj.lianjia.com/ershoufang/gulou/']

    def parse(self, response):
        """Parse one listing page and yield a ``LianjiawangItem`` per listing.

        :param response: Scrapy response for a ``/ershoufang/gulou/`` page.
        :yields: one populated ``LianjiawangItem`` per ``<li>`` listing entry.
        """
        soup = BeautifulSoup(response.body, 'html.parser')
        listing_ul = soup.find('ul', {"class": "sellListContent"})
        if listing_ul is None:
            # Layout changed or an anti-scraping page was served; nothing to parse.
            return
        # find_all('li') skips the whitespace text nodes that ``.children`` also
        # yields (calling .find(...) on those NavigableStrings misbehaves).
        for tag in listing_ul.find_all('li', recursive=False):
            # A fresh item per listing: reusing one instance across iterations
            # would make every yielded item share (and overwrite) the same data.
            item = LianjiawangItem()

            # Listing headline.
            title = tag.find('div', {"class": "title"}).get_text().strip()

            # houseInfo text looks like "小区名 | 户型 | 面积 | 朝向 | 装修 | ..."
            # — TODO confirm against the live markup, Lianjia changes it often.
            house_info = tag.find('div', {"class": "address"}).find('div', {"class": "houseInfo"})
            community_name = house_info.find('a').get_text().strip()
            parts = house_info.get_text().split('|')
            house_type = parts[1].strip()   # layout, e.g. 3室2厅
            area = parts[2].strip()         # floor area
            face = parts[3].strip()         # orientation
            decoration = parts[4].strip()   # decoration status

            # Floor position, building height and construction year.
            position_info = tag.find('div', {"class": "flood"}).find('div', {"class": "positionInfo"})
            pos_text = position_info.get_text()
            # Fixed-offset slicing kept from the original implementation; it
            # assumes a rigid text layout and is fragile — verify before relying on it.
            louceng = pos_text[0:3].strip()
            heigh = pos_text[4:7].strip()
            cons_time = pos_text[8:13].strip()
            # Community location (the anchor inside positionInfo).
            position = position_info.find('a').get_text().strip()

            # Follower count and publish time, "X人关注 / Y发布" style text.
            follow_parts = tag.find('div', {"class": "followInfo"}).get_text().strip().split('/')
            focus_num = follow_parts[0].strip()
            pu_time = follow_parts[1].strip()

            # Total price and unit price (strip the "单价…元/平米" wrapper text).
            price_div = tag.find('div', {"class": "priceInfo"})
            total_price = price_div.find('div', {"class": "totalPrice"}).find('span').get_text().strip()
            unit_price = price_div.find('div', {"class": "unitPrice"}).find('span').get_text()
            unit_price = unit_price.replace("单价", "").replace("元/平米", "").strip()

            item["title"] = title
            item["community_name"] = community_name
            item["house_type"] = house_type
            item["area"] = area
            item["face"] = face
            item["decoration"] = decoration
            item["louceng"] = louceng
            item["heigh"] = heigh
            item["cons_time"] = cons_time
            item["position"] = position
            item["focus_num"] = focus_num
            item["pu_time"] = pu_time
            item["total_price"] = total_price
            item["unit_price"] = unit_price
            yield item


在导入 item 类时遇到一个问题:
from lianjiawang.items import LianjiawangItem —— 这句代码在 IDE 中会报错(无法解析 lianjiawang 包路径)。
解决办法:
(原文此处为一张截图,图片未能保留。)
items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Item, Field

class LianjiawangItem(scrapy.Item):
    """Container for one second-hand housing listing scraped from Lianjia."""

    title = scrapy.Field()           # listing headline
    community_name = scrapy.Field()  # residential community name
    house_type = scrapy.Field()      # layout, e.g. 3室2厅
    area = scrapy.Field()            # floor area
    face = scrapy.Field()            # orientation
    decoration = scrapy.Field()      # decoration status
    louceng = scrapy.Field()         # floor position text
    heigh = scrapy.Field()           # building height text
    cons_time = scrapy.Field()       # construction year text
    position = scrapy.Field()        # community location
    focus_num = scrapy.Field()       # follower count text
    pu_time = scrapy.Field()         # publish-time text
    total_price = scrapy.Field()     # total price
    unit_price = scrapy.Field()      # price per square metre

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import csv

class LianjiawangPipeline(object):
    """Append each scraped listing to ``lianjia_1.csv`` as one row."""

    # Column order written to the CSV file (matches the original hand-written list).
    _FIELDS = (
        "title", "community_name", "total_price", "unit_price",
        "house_type", "area", "face", "decoration", "louceng",
        "heigh", "position", "cons_time", "focus_num", "pu_time",
    )

    def process_item(self, item, spider):
        """Write *item* as one CSV row and return it unchanged.

        :param item: mapping with all keys listed in ``_FIELDS``.
        :param spider: the spider that produced the item (unused).
        :returns: *item*, so later pipeline stages still receive it.

        The file is opened with an explicit UTF-8 encoding: the original code
        relied on the platform default, which raises ``UnicodeEncodeError`` for
        the Chinese listing text on locales that cannot represent it.
        """
        row = [item[field] for field in self._FIELDS]
        with open("lianjia_1.csv", "a", newline="", encoding="utf-8") as fp:
            csv.writer(fp, dialect='excel').writerow(row)
        return item

附完整工程代码链接
链接: https://pan.baidu.com/s/1DZbTpWboxXDuFgDhG6qTzQ 提取码: g612

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值