Compared with the pymysql version, very little needs to change.
In fact, MySQLdb can be used under Python 3.x as well; the method is as follows.
Preparation
Install with `pip install PyMySQL`, then add two lines to the project's `__init__.py`:
```python
import pymysql
pymysql.install_as_MySQLdb()
```
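Once those two lines have run, every `import MySQLdb` in the project actually resolves to PyMySQL. A quick sanity-check sketch (not from the original post):

```python
import pymysql
pymysql.install_as_MySQLdb()  # registers pymysql under the module name "MySQLdb"

import MySQLdb
print(MySQLdb.__name__)  # prints "pymysql" -- the module is PyMySQL in disguise
```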
- Operating the database with MySQLdb (the settings stay unchanged)

The wrapped database functions are essentially the same as in the pymysql article; the only extension here is inserting into multiple tables. For reference, the settings entries the helper reads are sketched right below.
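Since the settings stay unchanged, here is roughly what they look like; the concrete values and the `ITEM_PIPELINES` module path are examples to adapt, not taken from the original:

```python
# settings.py -- example values, adjust to your environment
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = 'your_password'
MYSQL_DBNAME = 'testdb'

# register the pipeline defined further below
ITEM_PIPELINES = {
    'scrapy_mysql_demo.pipelines.WebcrawlerScrapyPipeline': 300,
}
```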
- dbhelper.py
```python
# coding:utf-8
import MySQLdb
from scrapy.utils.project import get_project_settings  # pull in the Scrapy settings


class DBHelper():
    def __init__(self):
        self.settings = get_project_settings()  # read connection info from settings

        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']

    # connect to the MySQL server (no database selected yet)
    def connectMysql(self):
        conn = MySQLdb.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               charset='utf8')
        return conn

    # connect to the target database
    def connectDatabase(self):
        conn = MySQLdb.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               db=self.db,
                               charset='utf8')
        return conn

    # create the database if it does not exist
    def createDatabase(self):
        conn = self.connectMysql()
        sql = "create database if not exists " + self.db
        cur = conn.cursor()
        cur.execute(sql)
        cur.close()
        conn.close()

    # create a table
    def createTable(self, sql):
        conn = self.connectDatabase()
        cur = conn.cursor()
        cur.execute(sql)
        cur.close()
        conn.close()

    # insert a row (sql uses %s placeholders, params fills them in)
    def insert(self, sql, *params):
        conn = self.connectDatabase()
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        cur.close()
        conn.close()

    # update rows
    def update(self, sql, *params):
        conn = self.connectDatabase()
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        cur.close()
        conn.close()

    # delete rows
    def delete(self, sql, *params):
        conn = self.connectDatabase()
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        cur.close()
        conn.close()


# exercise the database helpers
class TestDBHelper():
    def __init__(self):
        self.dbHelper = DBHelper()

    # create the database
    def testCreateDatabase(self):
        self.dbHelper.createDatabase()

    # create the extra table; testtable itself is assumed to already exist
    # (it was created in the earlier pymysql post)
    def testCreateTable(self):
        sql = "create table if not exists test2(id int primary key auto_increment,new1 varchar(50),new2 varchar(200))"
        self.dbHelper.createTable(sql)

    # Insert data. Here you can insert into different tables: the item passed
    # in from the pipeline carries all of the scraped fields, so you can split
    # them across tables as needed. The same works with pymysql too -- just
    # take the fields you need from the item and store them where they belong.
    def testInsert(self, item):
        sql1 = "insert into testtable(name,url) values(%s,%s)"
        params1 = (item['name'], item['url'])
        self.dbHelper.insert(sql1, *params1)

        sql2 = "insert into test2(new1,new2) values(%s,%s)"
        params2 = (item['new1'], item['new2'])
        self.dbHelper.insert(sql2, *params2)

    # update
    # def testUpdate(self):
    #     sql = "update testtable set name=%s,url=%s where id=%s"
    #     params = ("update", "update", "1")
    #     self.dbHelper.update(sql, *params)

    # delete (note the trailing comma: params must be a tuple)
    # def testDelete(self):
    #     sql = "delete from testtable where id=%s"
    #     params = ("1",)
    #     self.dbHelper.delete(sql, *params)


# ad-hoc test entry point
if __name__ == "__main__":
    testDBHelper = TestDBHelper()
    testDBHelper.testCreateDatabase()
    testDBHelper.testCreateTable()
    # testDBHelper.testInsert(item)  # needs an item with the expected fields
    # testDBHelper.testUpdate()
    # testDBHelper.testDelete()
```
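The helper can also be called directly; a quick hypothetical example (the values are made up):

```python
db = DBHelper()
# the %s placeholders are filled in by the driver, which escapes the values
db.insert("insert into testtable(name,url) values(%s,%s)",
          "example", "https://example.com")
```

Keeping the SQL string and the values separate like this lets the driver handle the escaping, which also protects against SQL injection.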
- pipelines.py
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy_mysql_demo.db.dbhelper import TestDBHelper


class WebcrawlerScrapyPipeline(object):
    # set up the database helper once, and make sure the tables exist
    def __init__(self):
        self.db = TestDBHelper()
        self.db.testCreateTable()

    # write each scraped item into the database
    def process_item(self, item, spider):
        self.db.testInsert(item)
        return item
```
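The pipeline assumes the item carries the four fields used by testInsert(); a minimal hypothetical items.py to match (the class name DemoItem is made up):

```python
import scrapy


class DemoItem(scrapy.Item):
    # fields consumed by testInsert()
    name = scrapy.Field()
    url = scrapy.Field()
    new1 = scrapy.Field()
    new2 = scrapy.Field()
```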
Ways to significantly improve crawler efficiency:
1. Use a machine with better performance
2. Use a fiber-optic network connection
3. Multithreading
4. Multiprocessing
5. Distributed crawling
6. Speed up the data writes (see the batch-insert sketch right after this list)
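For point 6, one common approach is to buffer items and write them in batches with `executemany` instead of issuing one INSERT per item. A minimal sketch reusing the DBHelper above; the buffering logic and the batch size of 100 are illustrative choices, not from the original post:

```python
from scrapy_mysql_demo.db.dbhelper import DBHelper


class BatchInsertPipeline(object):
    # hypothetical pipeline: buffers items and flushes them in batches
    def __init__(self):
        self.dbHelper = DBHelper()
        self.buffer = []

    def process_item(self, item, spider):
        self.buffer.append((item['name'], item['url']))
        if len(self.buffer) >= 100:  # arbitrary batch size
            self.flush()
        return item

    def flush(self):
        conn = self.dbHelper.connectDatabase()
        cur = conn.cursor()
        # executemany sends the whole batch with a single commit,
        # cutting the number of round-trips to the server
        cur.executemany("insert into testtable(name,url) values(%s,%s)",
                        self.buffer)
        conn.commit()
        cur.close()
        conn.close()
        self.buffer = []

    def close_spider(self, spider):
        # flush whatever is left when the spider finishes
        if self.buffer:
            self.flush()
```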
Countermeasures against anti-crawler defenses:
1. Randomize the User-Agent (see the middleware sketch after this list)
2. Disable cookie tracking
3. Slow the crawler down
4. Use proxies and switch IPs dynamically (your own machine's IP stays fixed; the "dynamic IP" is the proxy's)
5. Go distributed (usually with machines in different regions, not all on one LAN)
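Points 1-4 map directly onto Scrapy settings and a small downloader middleware. A minimal sketch; the USER_AGENTS and PROXIES lists and the `myproject` module path are placeholders, not part of the original project:

```python
# middlewares.py
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
]
PROXIES = ["http://127.0.0.1:8888"]  # replace with real proxy addresses


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # point 1: a different User-Agent for every request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # point 4: route the request through a randomly chosen proxy
        request.meta['proxy'] = random.choice(PROXIES)
```

```python
# settings.py
COOKIES_ENABLED = False  # point 2: disable cookie tracking
DOWNLOAD_DELAY = 2       # point 3: slow the crawler down

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    'myproject.middlewares.RandomProxyMiddleware': 544,
}
```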