1.items定义
class HistoryItem(scrapy.Item):
    """Scrapy item persisted into the `history` table by the MySQL pipeline."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name
class ForecastItem(scrapy.Item):
    """Scrapy item persisted into the `forecast` table by the MySQL pipeline."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name
2.文件编码
# coding=utf-8
# Put these lines at the very top of the file.
# NOTE: Python 2 only. `reload` is a builtin and `sys.setdefaultencoding`
# is only reachable via the reload trick in Python 2; Python 3 neither
# needs nor allows this (str is already Unicode there).
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
3.spider 写法演示(对原文件进行了修改删减,仅做演示)
# coding=utf-8
import cgi
import sys
import time

import scrapy

from wanglu.items import ForecastItem, SupplyItem

reload(sys)
sys.setdefaultencoding('utf-8')
# run the spider with: scrapy crawl wang
class WangSpider(scrapy.Spider):
    """Demo spider: requests detail pages by numeric id and yields items."""
    name = "wang"
    base_url = "http://www.sample.com/info?id="  # replace with your own URL

    # Variant A: read the id parameters from seedall.txt, one per line
    # def __init__(self):
    #     ids = open("seedall.txt", "r").readlines()
    #     for did in ids:
    #         self.start_urls.append(self.base_url + did.replace("\n", "").strip())

    # Variant B: generate the ids from a numeric range
    def __init__(self, *args, **kwargs):
        super(WangSpider, self).__init__(*args, **kwargs)
        # Build a per-instance list. Appending to the class-level start_urls
        # mutates a shared class attribute, so every new instance would keep
        # accumulating the same urls.
        self.start_urls = [self.base_url + str(did) for did in range(22312, 23475)]

    def parse(self, response):
        # Throttle politely. (Prefer the DOWNLOAD_DELAY setting: sleeping in
        # the callback blocks the whole reactor. The original wrapped this in
        # try/except BaseException, which silently hid the fact that `time`
        # was never imported.)
        time.sleep(1)
        pid = response.url.split("=")[-1]
        table = response.xpath("/html/body/div[@id='basicInfoDiv']/div/table[@class='jbxx_table'][1]")
        # NOTE(review): WangluItem is not imported in this trimmed snippet --
        # import it from wanglu.items in the real project.
        item = WangluItem()
        item["pid"] = pid
        item["company_name"] = table.xpath(".//td/label[@id='lblcname']/text()")[0].extract().strip().replace("\r\n", "")
        if item["company_name"].strip() == '':
            return  # empty page: nothing worth yielding
        yield item
        forecastTable = response.css("#businessPlanDiv div > table")[1]
        if forecastTable and forecastTable.css("tr"):
            for i, tr in enumerate(forecastTable.css("tr")):
                if i > 1:  # skip the header rows
                    # NOTE(review): Forecast2Item is not imported here either.
                    forecast = Forecast2Item()
                    forecast["pid"] = pid
                    # cgi.escape(s, True) escapes HTML specials AND quotes,
                    # which keeps the value safe to store/render later.
                    forecast['year'] = cgi.escape(tr.xpath(".//td[1]/text()")[0].extract().strip().replace("\r\n", ""), True)
                    # NOTE(review): this reads td[1] again (the year cell) and
                    # extracts the whole element, not its text -- probably
                    # td[2]/text() was intended; confirm against the real page.
                    forecast['edu_bg'] = cgi.escape(tr.xpath('.//td[1]')[0].extract().strip().replace("\r\n", ""), True)
                    yield forecast
选择器文档链接
3.1 xpath 浏览器工具可以查看,下图为360浏览器获取xpath
但是这样获取的并不一定有效,而且可能很长,经常需要根据自己的情况修改xpath.
tbody 是一个小坑,因为源返回值中可能没有tbody ,但是浏览器会自动加上tbody, 造成路径问题,要注意。
3.2 css 选择器, ">" 直接子元素 ,":nth-child()" 我较常用
注意 :xpath和css方法返回的都是”数组”
cgi.escape(str,True) 将html特殊符号和引号转义,在保存数据到数据库时很有用
取文本 :
tr.xpath(".//td/text()")
tr.css("td::text")
取元素 (td中包含其他标签时):
tr.xpath(".//td")
tr.css("td")
取属性:
supply['file_url'] =tr.xpath(".//td[3]/a/@href")
supply['file_url'] =tr.css("a::attr(href)")
更多更详细请查文档
4.pipelines
log
import logging

# Module-level logger shared by the pipelines below.
logger = logging.getLogger('WLog')
logger.setLevel(logging.DEBUG)

# Console handler (created but intentionally not attached -- see below).
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# File handler: append UTF-8 records to error.log.
file_handler = logging.FileHandler("error.log", "a+", "utf-8")
file_handler.setLevel(logging.DEBUG)

log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(log_format)
file_handler.setFormatter(log_format)

# Attach handlers to the logger (only the file handler is active).
# self.logger.addHandler(console_handler)
logger.addHandler(file_handler)
将结果以json格式保存到文件
import json
class FilePipeline(object):
    """Append every scraped item to items.jl, one JSON object per line."""

    def __init__(self):
        # Append mode so repeated runs keep the earlier results.
        self.file = open('items.jl', 'a+')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the output file instead of being mangled into \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
保存到EXCEL
from openpyxl import Workbook
class ExcelPipeline(object):
    """Collect items into an in-memory workbook and save it when the spider closes."""

    def __init__(self):
        try:
            self.wb = Workbook()
            # grab the active worksheet
            self.ws = self.wb.active
        except Exception as e:  # was BaseException: too broad, swallows KeyboardInterrupt
            logger.warning(e.__str__())  # logger.warn is a deprecated alias

    def process_item(self, item, spider):
        try:
            self.ws.append([item['did'], item["company_name"], item["industry"], item["area"]])
        except Exception as e:
            # str() the id: if `did` is an int, int + str raises TypeError
            # inside the handler and masks the original error.
            logger.warning(str(item["did"]) + ":" + e.__str__())
        return item

    def close_spider(self, spider):
        # `time` was never imported at the top of the original snippet;
        # import it here so the timestamped filename actually works.
        import time
        try:
            self.wb.save(time.strftime("%y%m%d%H%M%S") + ".xlsx")
        except Exception as e:
            logger.warning(e.__str__())
保存到数据库
import logging
import MySQLdb
# 保存到数据库
class MySQLPipeline(object):
    """Persist HistoryItem / ForecastItem objects into MySQL."""

    def __init__(self, db):
        self.db = db  # live MySQLdb connection

    @classmethod
    def from_settings(cls, settings):
        # Alternate constructor called by scrapy with the project settings.
        db = MySQLdb.connect(settings["MYSQL_HOST"], settings["MYSQL_USER"],
                             settings["MYSQL_PASSWD"], settings["MYSQL_DBNAME"],
                             charset="utf8")
        return cls(db)

    # called by scrapy for every yielded item
    def process_item(self, item, spider):
        cursor = self.db.cursor()
        item["pid"] = int(item["pid"])
        if isinstance(item, HistoryItem):
            try:
                # Parameterized query: the driver quotes/escapes the values,
                # fixing the injection and quoting bugs of %-formatted SQL.
                cursor.execute(
                    'insert into history(pid,name,v2013,v2014,v2015) VALUES (%s, %s, %s, %s, %s)',
                    (item['pid'], item["name"], item["v2013"], item["v2014"], item["v2015"]))
                # The original never committed, so nothing was persisted
                # unless autocommit happened to be enabled.
                self.db.commit()
            except Exception as e:  # was BaseException: far too broad
                self.db.rollback()
                logger.error(str(item["pid"]) + ":HistoryItem:" + e.__str__())
        elif isinstance(item, ForecastItem):
            cursor.execute("select id from forecast where pid = %s", (item["pid"],))
            ret = cursor.fetchone()
            try:
                if ret:  # pid already present: update the existing row
                    cursor.execute('update ..........')
                else:  # original had `else` without a colon and a stray `)`
                    cursor.execute('insert ..........')
                self.db.commit()
            except Exception as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":ForecastItem:" + e.__str__())
        else:
            pass  # unknown item type: pass it through untouched
        return item

    def close_spider(self, spider):
        self.db.close()
当字段众多时如何更简便的保存到数据库?知道的留言告知下多谢。(17年8月28日更新:突然想到ORM,百度了一下果然python也有ORM框架SQLAlchemy、Django,以后可以试下怎么用)
拼接表的字段
SELECT group_concat(COLUMN_NAME) from information_schema.COLUMNS where table_name = 'business_plan' and table_schema = 'test'; -- 得到逗号分隔的字符串
SELECT group_concat(COLUMN_NAME SEPARATOR "='%s',") from information_schema.COLUMNS where table_name = 'business_plan' and table_schema = 'test';
5 Settings (截取我设置了的)
# -*- coding: utf-8 -*-
# Scrapy project settings (only the values the author changed are shown).
BOT_NAME = 'wanglu'
SPIDER_MODULES = ['wanglu.spiders']
NEWSPIDER_MODULE = 'wanglu.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Kept at 1 here to crawl slowly / politely.
CONCURRENT_REQUESTS = 1
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'wanglu.pipelines.SomePipeline': 300,
#}
# Earlier pipeline combinations, kept for reference:
#ITEM_PIPELINES = {'wanglu.pipelines.WangluPipeline': 300}
#ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300,'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300}
# NOTE(review): MySQLPipeline2 is not among the pipeline classes shown in this
# post (only MySQLPipeline is) -- confirm it exists in wanglu/pipelines.py.
ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline2': 300}
# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123123'
# end of MySQL database configure setting