Crawling with the Scrapy framework and writing the results to a database

  • Install the framework: pip install scrapy

  • In a directory of your choice, create a new Scrapy project:
    scrapy startproject <project name>
    This generates the skeleton of a crawler project (layout shown below).
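For reference, the generated project (here assumed to be named Boss, matching the from Boss.items import used later) typically has this layout:

Boss/
├── scrapy.cfg            # deployment configuration
└── Boss/                 # the project's Python module
    ├── __init__.py
    ├── items.py          # item (entity) definitions
    ├── middlewares.py    # downloader / spider middlewares
    ├── pipelines.py      # item pipelines
    ├── settings.py       # project settings
    └── spiders/          # spiders are created here
        └── __init__.py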

  • Generate a spider to crawl the page (the generated skeleton is shown below):
    scrapy genspider <spider name> <allowed domain>
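For this project the command would be scrapy genspider zhipin lianjia.com (the spider name and domain used below). It generates a skeleton roughly like the following; the exact template varies slightly between Scrapy versions:

# Boss/spiders/zhipin.py (generated skeleton)
import scrapy


class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'
    allowed_domains = ['lianjia.com']
    start_urls = ['http://lianjia.com/']

    def parse(self, response):
        pass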

  • Define the item (entity) class
    Open the project in PyCharm and edit items.py:

import scrapy


class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()       # each Field maps to one attribute of the scraped item
    salary = scrapy.Field()
    address = scrapy.Field()    # used by the spider and by the database import below
    
  • Write the spider
# -*- coding: utf-8 -*-
import scrapy
from Boss.items import BossItem


class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'                                    # spider name
    allowed_domains = ['lianjia.com']                  # allowed crawl domain
    start_urls = ['https://sh.lianjia.com/zufang/']    # start URL

    def parse(self, response):
        items = []
        posts = response.xpath("//div[@class='content__list--item--main']")
        for each in posts:
            item = BossItem()
            # use relative XPath (no leading //) so every field comes from the current post
            item["name"] = each.xpath("p[@class='content__list--item--title twoline']/a/text()").extract()[0]
            address = each.xpath("p[@class='content__list--item--des']/a[position()<4]/text()").extract()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract()[0]
            item["address"] = "".join(address)    # join district / area / community into one string
            print(item)
            items.append(item)
            # yield item
        return items

        # quick sanity check that the page itself can be fetched:
        # with open("lianjia.html", "w", encoding="utf-8") as file:
        #     file.write(response.text)
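If parse() comes back empty, the XPath expressions can be tried out interactively in Scrapy's shell before editing the spider; the expressions below simply mirror the ones used above:

# in a terminal:  scrapy shell "https://sh.lianjia.com/zufang/"
# then, inside the shell:
posts = response.xpath("//div[@class='content__list--item--main']")
len(posts)    # should be greater than 0 if the page was fetched and parsed correctly
posts[0].xpath("p[@class='content__list--item--title twoline']/a/text()").extract()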



  • Configure the pipeline: add the following to settings.py to enable it
ITEM_PIPELINES = {
   'Boss.pipelines.BossPipeline': 300,   # project.module.class -- the smaller the number, the higher the priority (0-1000)
}
# browser User-Agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
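A couple of additional settings can help with the 403/302 problems listed at the end of this post. These are standard Scrapy options; the values here are only illustrative:

# settings.py (optional) -- slow the crawl down and send browser-like headers
DOWNLOAD_DELAY = 1            # wait 1 second between requests
COOKIES_ENABLED = False       # don't send cookies unless the site requires them
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}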
  • Write the pipeline class (writes the items to a file)
import json


class BossPipeline(object):
    def __init__(self):
        # open the output file once when the pipeline is created
        self.file = open("lianjia.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # serialize each item as one JSON object per line
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
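The same pipeline mechanism can also write straight into MySQL, skipping the intermediate JSON file. The sketch below is only an illustration of that idea; it reuses the connection parameters from the import script further down (localhost / root / 123456 / pytest) and the lianjia table defined there, and is not part of the original project:

import pymysql


class MySQLPipeline(object):
    """Hypothetical pipeline that inserts each item directly into MySQL."""

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', database='pytest',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query: pymysql escapes the values for us
        sql = 'insert into lianjia(name, salary, address) values (%s, %s, %s)'
        self.cursor.execute(sql, (item.get('name', '').strip(),
                                  item.get('salary'),
                                  item.get('address')))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

To enable it, it would be registered in ITEM_PIPELINES in settings.py just like BossPipeline, e.g. 'Boss.pipelines.MySQLPipeline': 400.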

  • Run the crawler project
    scrapy crawl <spider name>
  • Alternatively, write a small launcher: create a new .py file inside the crawler project and run it directly
from scrapy import cmdline
cmdline.execute("scrapy crawl zhipin".split())    # equivalent to running `scrapy crawl zhipin` in a terminal
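If a pure-Python entry point is preferred over cmdline.execute, Scrapy's CrawlerProcess API does the same job. This is a small sketch that relies on the project settings, so it must live inside the Scrapy project:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('zhipin')    # the spider name defined above
process.start()            # blocks until the crawl has finished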

After the run finishes, a lianjia.json file appears in the project directory with the scraped results.

Create the database table and import the JSON data with Python (pymysql):

import json
import traceback

import pymysql


class PyMySQL(object):
    create_table = 'create table lianjia(id int not null primary key auto_increment, name varchar(255) not null, salary int, address varchar(255)) default charset=utf8'
    select = 'select * from lianjia'    # query used by select_data()

    def __init__(self, host, user, pwd, db):
        # use keyword arguments; recent pymysql versions require them
        self.conn = pymysql.connect(host=host, user=user, password=pwd, database=db, charset='utf8')
        self.cursor = self.conn.cursor()

    def create_table_func(self):
        self.cursor.execute(PyMySQL.create_table)
        print('table created')

    def insert_data(self, sql):
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()

    def select_data(self):
        self.cursor.execute(PyMySQL.select)
        all_data = self.cursor.fetchall()
        for i in all_data:
            print('query result: {}'.format(i))


if __name__ == '__main__':
    my = PyMySQL('localhost', 'root', '123456', 'pytest')
    # my.create_table_func()
    with open('../lianjia.json', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            print(line)
            temp = json.loads(line)
            name = temp['name'].strip()
            salary = temp['salary']
            address = temp['address']
            sql = 'insert into lianjia(name,salary,address) values("%s","%s","%s")' % (name, salary, address)
            my.insert_data(sql)


Running this script writes the scraped data into the database.
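To check that the rows actually landed in the table, the select_data method defined above can be called (or the same query run in the MySQL client):

# run after the import script above, e.g. at the bottom of the same file
my = PyMySQL('localhost', 'root', '123456', 'pytest')
my.select_data()    # prints every row currently stored in the lianjia table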
Common problems

  1. DEBUG: Forbidden by robots.txt
  2. DEBUG: Crawled (403)
  3. DEBUG: Redirecting (302) to
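All three messages usually have configuration-level fixes. The snippet below restates the settings already used in this post (ROBOTSTXT_OBEY, USER_AGENT) and adds one per-request option for the 302 case; treat it as a starting point rather than a guaranteed solution:

# settings.py
ROBOTSTXT_OBEY = False            # issue 1: stop Scrapy from honouring robots.txt
USER_AGENT = 'Mozilla/5.0 ...'    # issue 2: a browser-like User-Agent often avoids 403 responses

# issue 3: to inspect a 302 response instead of following it, set these request meta flags
# yield scrapy.Request(url, callback=self.parse,
#                      meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})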