19-Python Scrapy入门

最新推荐文章于 2024-09-15 22:31:42 发布

chitian6393

最新推荐文章于 2024-09-15 22:31:42 发布

阅读量196

点赞数

文章标签： python 数据库爬虫

原文链接：https://my.oschina.net/CoderW/blog/1305412

版权

1.创建项目

①、在想要创建目录的：scrapy startproject projectname

②、项目初始模板：

A：spider-爬虫对象<发送请求、解析数据>

B：Item-抓取数据的模型<字典>，可以扩展ItemLoader对原始数据进行二次加工。

C：middlewares-各种高级配置对象<还未深入学习>

D：pipelines-接收并校验数据，存储数据

E：settings-对爬虫进行全局配置

F：scrapy.cfg

2.常用scrapy命令

# 初始化项目
scrapy startproject project name

# 运行爬虫项目(注意这里的spidername爬虫对象Spider中的name字段值,同一工程中的spidername不允许重复)
scrapy crawl spidername

# 直接运行创建的爬虫
scrapy runspider spidername

# 查看当前项目中有多少爬虫
scrapy list

# 使用代码运行爬虫
from scrapy import cmdline
cmdline.execute("scrapy crawl spidername".split())

注：https://doc.scrapy.org/en/latest/topics/items.html

3.Crontab

    ①、Crontab命令是Unix系统和类Unix系统中，用来设置周期性执行的指令。
    ②、该命令从标准输入设备读取指令，并将其存放在“Crontab”文件中，以供后期读取和执行。
    ③、Crontab所存的指令，被守护进程激活。
    ④、crond常常在后台运行，以一分钟为间隔检查是否有预定的作业需要执行，这类作业一般成为cron jobs。

# 配置规则

# 1 * * * *     每分钟执行一次
# 0 * * * *     每小时执行一次
# 0 0 * * *     每天执行一次
# 0 0 * * 0     每周执行一次
# 0 0 1 * *     每月执行一次
# 0 0 1 1 *     每年执行一次

配置步骤：

①、crontab -e

②、添加自己的定时任务

4.实战scrapy小练习

淘宝MM信息小爬虫

①、Spiders

#!/usr/bin/python
# -*- coding: UTF-8 -*-

__author__ = 'apple'

import scrapy
from scrapy import Spider,cmdline
from ScrapyTemplate.items import TBMMInfo

from scrapy.selector import Selector,SelectorList
from scrapy.loader import ItemLoader,processors
from scrapy.http.response.html import HtmlResponse

import re,time

# 爬取淘宝MM的数据
class TBMMSpider(Spider):
    # 用于区别Spider，该名字必须是唯一的，您不可以为不同的Spider设定相同的名字
    name = 'TBMM'
    # 请求时间间隔
    download_delay = 3
    # 请求页码范围
    page_index = 0
    max_index = 1
    # 允许访问的域名
    allowed_domain = ['mm.taobao.com']
    # 包含了Spider在启动时进行爬取的url列表®
    # start_urls = [ ]
    # 构造请求
    def start_requests(self):
        self.page_index += 1
        results = []
        if self.page_index <= self.max_index:
            new_url = 'https://mm.taobao.com/json/request_top_list.htm?page=%s'%(self.page_index)
            print('新的请求:',new_url)
            results.append(scrapy.Request(new_url,callback=self.parse))
        return results

    # 解析方法：parse() 是spider的一个方法
    def parse(self, response):
        for person in response.xpath('/html/body/div[@class="list-item"]'):
            tbMM = TBMMInfo()
            ######  CSS Selector解析
            # 用户基础信息
            person_info = person.css('div.personal-info>div.pic-word')
            # 用户扩展信息
            list_info = person.css('div.list-info')

            tbMM['name'] = person_info.css('p.top>a::text').extract_first()
            tbMM['userid'] = person_info.css('p.top>span.friend-follow::attr(data-userid)').extract_first()
            tbMM['homepage'] = 'http:'+person_info.css('p.top>a::attr(href)').extract_first()
            tbMM['avater'] = 'http:'+person_info.css('div.pic>a>img::attr(src)').extract_first()
            tbMM['age'] = person_info.css('p.top>em>strong::text').extract_first()
            # 匿名类：last-child nth-child(num)
            tbMM['city'] = person_info.css('p.top>span::text').extract_first()
            tbMM['jobs'] = person_info.css('p:last-child>em::text').extract_first()
            tbMM['fans'] = person_info.css('p:last-child>em:last-child>strong::text').extract_first()
            rank = list_info.css('div.popularity>dl>dt::text').extract_first()
            tbMM['rank'] = re.sub('\s','',rank)
            total_integral = list_info.css('div.popularity>dl>dd::text')[1].extract()
            tbMM['total_integral'] = re.sub('\s','',total_integral)
            # 信息列表
            ul_list = ['add_integral','feedback_rate','image_count','sign_count']
            for i in range(1,len(ul_list) + 1):
                css_str = 'ul.info-detail>li>strong::text' if i == 1 else 'ul.info-detail>li:nth-child(%s)>strong::text'%(i)
                value = list_info.css(css_str).extract_first()
                if re.sub('\D','',value) == '':
                    value = '0'
                tbMM[ul_list[i - 1]] = value
            tbMM['description'] = list_info.css('p::text').extract_first().strip()
            tbMM['insert_date'] = str(time.time())

            ######  XPath Selector解析
            # person_info = person.xpath('div[@class="personal-info"]/div[@class="pic-word"]')
            # list_info = person.xpath('div[@class="list-info"]')
            #
            # tbMM['name'] = person_info.xpath('p[@class="top"]/a/text()').extract_first()
            # tbMM['userid'] = person_info.xpath('p[@class="top"]/span[@class="friend-follow J_FriendFollow"]/@data-userid').extract_first()
            # tbMM['homepage'] = person_info.xpath('p[@class="top"]/a/@href').extract_first()
            # tbMM['avater'] = person_info.xpath('div[@class="pic s60"]/a/img/@src').extract_first()
            # tbMM['age'] = person_info.xpath('p[@class="top"]/em/strong/text()').extract_first()
            # tbMM['city'] = person_info.xpath('p[@class="top"]/span[1]/text()').extract_first()
            # tbMM['jobs'] = person_info.xpath('p[last()]/em/text()').extract_first()
            # tbMM['fans'] = person_info.xpath('p[last()]/em[last()]/strong/text()').extract_first()
            # rank = list_info.xpath('div[@class="popularity"]/dl/dt/text()').extract_first()
            # tbMM['rank'] = re.sub('\s', '', rank)
            # total_integral = list_info.xpath('div[@class="popularity"]/dl/dd/text()')[1].extract()
            # tbMM['total_integral'] = re.sub('\s', '', total_integral)
            # ul_list = ['add_integral', 'feedback_rate', 'image_count', 'sign_count']
            # for i in range(1, len(ul_list) + 1):
            #     tbMM[ul_list[i - 1]] = list_info.xpath('ul[@class="info-detail"]/li[%s]/strong/text()'%(i)).extract_first()
            # tbMM['description'] = list_info.xpath('p/text()').extract_first().strip()

            # for key,value in tbMM.items():
            #     print(key,'=',value)

            yield tbMM

        # yield Request:返回请求对象
        requests = self.start_requests()
        if requests: yield requests[0]

    @classmethod
    def parser_list(cls,response):
        ######  XPath Selector解析使用itemLoader
        # 用户基础信息
        person_info = 'div.personal-info>div.pic-word>'
        # 用户扩展信息
        list_info = 'div.list-info>'
        # 初始化loader
        loader = ItemLoader(item=TBMMInfo(), response=response)
        loader.add_css('name', person_info + 'p.top>a::text')
        loader.add_css('userid', person_info + 'p.top>span.friend-follow::attr(data-userid)')
        loader.add_css('homepage', person_info + 'p.top>a::attr(href)')
        loader.add_css('avater', person_info + 'div.pic>a>img::attr(src)')
        loader.add_css('age', person_info + 'p.top>em>strong::text')
        # 匿名类：last-child nth-child(num)
        loader.add_css('city', person_info + 'p.top>span::text')
        loader.add_css('jobs', person_info + 'p:last-child>em::text')
        loader.add_css('fans', person_info + 'p:last-child>em:last-child>strong::text')
        loader.add_css('rank', list_info + 'div.popularity>dl>dt::text')
        loader.add_css('total_integral', list_info + 'div.popularity>dl>dd::text')
        # 信息列表
        ul_list = ['add_integral', 'feedback_rate', 'image_count', 'sign_count']
        for i in range(1, len(ul_list) + 1):
            loader.add_css(ul_list[i - 1],
                           list_info + 'ul.info-detail>li>strong::text' if i == 1 else list_info + 'ul.info-detail>li:nth-child(%s)>strong::text' % (
                           i))
        loader.add_css('description', list_info+'p::text')

        return loader.load_item()


if __name__ == "__main__":

    print('TBMMSpider__Main')
    '''
    # 迭代器
    def diedai_num():
        for i in range(4):
            yield i
            i += 1
        for m in range(3):
            yield str(m)+'TRUE'
            m += 1

    iter = iter(diedai_num())
    print(type(iter))
    x = 1
    nums = []
    strs = []
    while True:
        try:
            value = next(iter)
            if isinstance(value,int):
                nums.append(value)
            elif isinstance(value,str):
                strs.append(value)
            print('第%d次'%(x),value,'\n')
            x += 1
        except StopIteration:
            print('迭代完成!!!')
            break
    print('获取到的数字结果:',nums)
    print('获取到的字符串结果:',strs)
    '''

②、items

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field,Item

########### 对象模型类
class TBMMInfo(Item):

    # Item：一个包含了item所有声明的字段的字典，不仅仅是获取到的字段。
    #              该字典的key是字段(field)的名字，值是 Item声明 中使用到的 Field 对象。
    # Field：仅仅是内置字典的一个别名

    userid = Field()     # 用户ID
    name = Field()       # 姓名
    homepage = Field()   # 个人主页
    avater = Field()     # 头像
    age = Field()        # 年龄
    city = Field()       # 城市
    jobs = Field()       # 职业
    fans = Field()       # 粉丝数

    rank = Field()            # 排行
    total_integral = Field()  # 总积分
    add_integral = Field()    # 新增积分
    feedback_rate = Field()   # 好评率
    image_count = Field()     # 导购照片
    sign_count = Field()      # 签约数量
    description = Field()     # 描述

    insert_date = Field()     # 插入时间


# 在Item Loader 中声明的 field-specific 属性: field_in and field_out (most precedence)
# Item中的字段元数据(input_processor and output_processor key)
# Item Loader 默认处理器: ItemLoader.default_input_processor() and ItemLoader.default_output_processor() (least precedence)

③、pipelines

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from ScrapyTemplate.items import TBMMInfo
from ScrapyTemplate import settings

from scrapy import Item
from scrapy.exceptions import DropItem
import pymysql

class DBManagerHelp(object):

    def __init__(self):
        # 数据库连接参数
        self.db_configs = settings.DEFAULT_DB_CONFIG
        # 数据库
        self.connect(**self.db_configs)
        # 数据源
        self.data_source = []

    # 连接数据库
    def connect(self,**kwargs):
        try:
            self._SQL_DB = pymysql.Connect(**kwargs)
            self._DB_CURSOR = self._SQL_DB.cursor()
            print('DBManagerHelp数据库连接成功',self._SQL_DB)
        except Exception as e:
            self._SQL_DB = None
            self._DB_CURSOR = None
            print(repr(e), 'ERROR')

    @property
    def sql_db(self):
        if self._SQL_DB is None : self.connect(self.db_configs)
        return self._SQL_DB

    @property
    def db_cursor(self):
        if self._DB_CURSOR is None: self.connect(self.db_configs)
        return self._DB_CURSOR

    # 创建表
    def create_table(self,name,fields={}):
        if not fields :
            print('DBManagerHelp建表失败:',name)
            return
        fields_sql = ''
        items = fields.items() if isinstance(fields,dict) else fields
        for key, value in items:
            fields_sql += (str.upper(key) + ' ' + str.upper(value) + ',')
        # 建表sql
        table_sql = '''CREATE TABLE IF NOT EXISTS %s(%s)'''%(name,fields_sql[:-1])
        try:
            # 执行建表语句
            self.db_cursor.execute(table_sql)
            self.commit_all()
        except Exception as e:
            print(repr(e), 'ERROR')

    # 建表参数<留待子类重写>
    def table_infos(self):
        return '',{}

    # 批量插入到数据库
    def insert_items(self,item_source=[]):
        if not item_source : return
        # 获取表名和字段名
        table_name,fields = self.table_infos()
        # 需要插入的字段
        insert_keys = fields.keys()
        # 要插入的列表
        values_list = []
        for item in item_source:
            value_tuple = []
            for field in insert_keys:
                lower_field = str.lower(field)
                if field in item.keys():
                    value_tuple.append(item[field])
                elif lower_field in item.keys():
                    value_tuple.append(item[lower_field])
            if not value_tuple:continue
            values_list.append(tuple(value_tuple))
        if not values_list : return
        key_sql = ','.join(insert_keys)
        value_sql = ','.join(['%s' for i in range(len(insert_keys))])
        update_sql = ','.join([field + '=VALUES(' + field + ')' for field in insert_keys])
        # 插入语句
        insert_sql = r"""INSERT INTO %s(%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s"""%(table_name,key_sql,value_sql,update_sql)
        try:
            self.db_cursor.executemany(insert_sql,values_list)
            self.data_source.clear()
            self.commit_all()
        except Exception as e:
            print('数据插入失败:', repr(e))

    # 提交到数据库
    def commit_all(self,close=False):
        try:
            self.sql_db.commit()
        except Exception as e:
            print(repr(e), 'ERROR')
            # 事务回滚
            self.sql_db.rollback()
        if close:
            self.db_cursor.close()
            self.sql_db.close()

    ################# scrapy的回调方法 #################
    def open_spider(self, spider):
        print('DBManagerHelp开始:', spider)
        # 创建表格
        table_name,fields = self.table_infos()
        self.create_table(table_name,fields)

    def process_item(self, item, spider):
        if len(self.data_source) > settings.MAX_INSERTS_ITEM:
            print('批量插入数据：',settings.MAX_INSERTS_ITEM,'条')
            self.insert_items(self.data_source)

    def close_spider(self, spider):
        print('DBManagerHelp关闭:', spider)
        # 插入数据
        self.insert_items(self.data_source)
        # 事务提交或者回滚
        self.commit_all(close=True)


class TBMMInfoPipeline(DBManagerHelp):

    def table_infos(self):
        return 'TBMM_TABLE',{'USERID': 'VARCHAR(20) NOT NULL PRIMARY KEY',
                            'NAME': 'VARCHAR(20)',
                            'HOMEPAGE': 'VARCHAR(100)',
                            'AVATER': 'VARCHAR(100)',
                            'AGE': 'INT',
                            'CITY': 'VARCHAR(10)',
                            'JOBS': 'VARCHAR(40)',
                            'FANS': 'INTEGER',
                            'RANK': 'INTEGER',
                            'TOTAL_INTEGRAL': 'INTEGER',
                            'ADD_INTEGRAL': 'INTEGER',
                            'FEEDBACK_RATE': 'INTEGER',
                            'IMAGE_COUNT': 'INTEGER',
                            'SIGN_COUNT': 'INTEGER',
                            'DESCRIPTION': 'VARCHAR(400)',
                            'INSERT_DATE': 'INTEGER'
        }

    # 数据插入失败: InternalError(1366, "Incorrect integer value: '—' for column 'FEEDBACK_RATE' at row 1")
    # SQL语句在windows下不区分大小写，在Linux下数据库名、表名、字段名区分大小写

    def process_item(self, item, spider):
        super(TBMMInfoPipeline,self).process_item(item,spider)
        # 淘宝MM数据
        if isinstance(item,TBMMInfo) and item['userid'] is not None:
            self.data_source.append(item)
            return item
        else:
            raise DropItem('UserId is nil')

④、settings

# -*- coding: utf-8 -*-

# Scrapy settings for ScrapyTemplate project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

import random

BOT_NAME = 'ScrapyTemplate'

SPIDER_MODULES = ['ScrapyTemplate.spiders']
NEWSPIDER_MODULE = 'ScrapyTemplate.spiders'

# 日志级别
# CRITICAL - 严重错误(critical)
# ERROR - 一般错误(regular errors)
# WARNING - 警告信息(warning messages)
# INFO - 一般信息(informational messages)
# DEBUG - 调试信息(debugging messages)
LOG_LEVEL = 'ERROR'

# Obey robots.txt rules
# 通俗来说，robots.txt 是遵循 Robot协议 的一个文件，它保存在网站的服务器中。
# 它的作用是，告诉搜索引擎爬虫，本网站哪些目录下的网页 不希望 你进行爬取收录
ROBOTSTXT_OBEY = False

# 禁止重定向
# REDIRECT_ENABLED = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)<全局并发数>
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3      # 下载延迟
DOWNLOAD_TIMEOUT = 15   # 下载超时
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)<禁止/启用Cookie>
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  # 'Accept-Language': 'en',
   "Accept": "*/*",
   'Referer': 'http://www.baidu.com',
   'Accept-Encoding':'UTF-8'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   'ScrapyTemplate.middlewares.ScrapytemplateSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'ScrapyTemplate.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines<存储器部件>
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'ScrapyTemplate.pipelines.TBMMInfoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# 数据库链接默认配置
# '''
# host = None, user = None, password = "",
# database = None, port = 0, unix_socket = None,
# charset = '', sql_mode = None,
# read_default_file = None, conv = None, use_unicode = None,
# client_flag = 0, cursorclass = Cursor, init_command = None,
# connect_timeout = 10, ssl = None, read_default_group = None,
# compress = None, named_pipe = None, no_delay = None,
# autocommit = False, db = None, passwd = None, local_infile = False,
# max_allowed_packet = 16 * 1024 * 1024, defer_connect = False,
# auth_plugin_map = {}, read_timeout = None, write_timeout = None,
# bind_address = None
# '''
DEFAULT_DB_CONFIG = {
     'host':'127.0.0.1',
     'port':3306,
     'user':'root',
     'passwd':'apple',
     'db':'TESTDB',
     'charset':'utf8',
     'use_unicode':True,
     'connect_timeout':20
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENTS = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
              'Opera/9.25 (Windows NT 5.1; U; en)',
              'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
              'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
              'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
              'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
              "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
              "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
              'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
              'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
              'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
              'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
              'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
              'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
              'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36',
              ]
USER_AGENT = random.choice(USER_AGENTS)


# 数据库批量插入最大数值
MAX_INSERTS_ITEM = 100

⑤、数据库