How to Crawl Website Images with Scrapy?
Preface
Use Scrapy to crawl and download website images (using XPath expressions to extract the image links).
Save the links of successfully downloaded images to a database.
Directory Structure
I. Runtime Environment
1. Windows 10
2. Python 3.7.2
3. Scrapy 2.4.1
II. Usage Steps
1. settings configuration
- The IMAGES_STORE parameter must be set; the snippets here default to a folder on the desktop.
```python
import os

# File pipeline storage (defaults to a desktop folder named after the bot)
FILES_STORE = rf"{os.path.join(os.path.expanduser('~'), 'Desktop')}\{BOT_NAME}"
# Image pipeline storage (defaults to a desktop folder named after the bot)
IMAGES_STORE = rf"{os.path.join(os.path.expanduser('~'), 'Desktop')}\{BOT_NAME}"
# Minimum image height to accept
IMAGES_MIN_HEIGHT = 0
# Minimum image width to accept
IMAGES_MIN_WIDTH = 0
```
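The pipelines also have to be registered in settings.py before Scrapy will run them. A minimal sketch, assuming the project module is named `scrapyspider` and the pipeline class names used later in this post:

```python
# Register the custom pipelines (module path and priorities are assumptions;
# adjust them to your own project layout)
ITEM_PIPELINES = {
    'scrapyspider.pipelines.MyImagesPipeline': 300,
    'scrapyspider.pipelines.MySQLPipeline': 400,
}
```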
- MySQL connection parameters
```python
# Host
HOST_SQL = '127.0.0.1'
# Port
PORT_SQL = '3306'
# Username
USER_SQL = 'root'
# Password
PASSWORD_SQL = '123456'
# Database
DB_SQL = 'scrapyspider'
# Data table
TABLE_SQL = 'scrapy_media'
# Error data table
TABLE_ERROR_SQL = 'spider_error'
```
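The insert in process_item further below expects TABLE_SQL to have url, media_url, and items columns. The original post does not show the schema, so here is a hedged one-off creation script; the column types and sizes are assumptions:

```python
import MySQLdb

# One-off helper that creates the table used by the SQL pipeline;
# column types and sizes are assumptions, adjust them to your data
conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='scrapyspider', charset='utf8mb4')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS scrapy_media (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(768),
        media_url VARCHAR(768),
        items JSON
    )
""")
conn.commit()
conn.close()
```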
2. middlewares configuration
- Random User-Agent
```python
from fake_useragent import UserAgent


class RandomUserAgentMiddleware:
    """Attach a random User-Agent (and a referer) to every request."""

    def __init__(self):
        self.agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.agent.random)
        request.headers['referer'] = request.url
```
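UserAgent here comes from the third-party fake-useragent package (`pip install fake-useragent`), and the middleware must be enabled before Scrapy will call it. A sketch, assuming the same project module name as above:

```python
# settings.py — enable the middleware (the module path is an assumption)
DOWNLOADER_MIDDLEWARES = {
    'scrapyspider.middlewares.RandomUserAgentMiddleware': 543,
}
```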
3. pipelines configuration
- Read the SQL parameters from settings and connect to the database
```python
import json

import MySQLdb
from itemadapter import ItemAdapter


class MySQLPipeline:  # class name is not shown in the original post
    def __init__(self, hostSql: str, portSql: str, userSql: str, passwordSql: str,
                 dbSql: str, tableSql: str, tableErrorSql: str):
        """
        Initialize shared parameters.
        :param hostSql: host
        :param portSql: port
        :param userSql: username
        :param passwordSql: password
        :param dbSql: database name
        :param tableSql: data table
        :param tableErrorSql: error data table
        """
        # Database connection for the crawler
        self.mySQL = MySQLdb.connect(host=hostSql, user=userSql, passwd=passwordSql,
                                     db=dbSql, charset='utf8mb4', port=int(portSql))
        self.myCursor = self.mySQL.cursor()
        self.tableSql = tableSql
        self.tableErrorSql = tableErrorSql

    @classmethod
    def from_crawler(cls, crawler):
        hostSql = crawler.settings['HOST_SQL']
        portSql = crawler.settings['PORT_SQL']
        userSql = crawler.settings['USER_SQL']
        passwordSql = crawler.settings['PASSWORD_SQL']
        dbSql = crawler.settings['DB_SQL']
        tableSql = crawler.settings['TABLE_SQL']
        tableErrorSql = crawler.settings['TABLE_ERROR_SQL']
        return cls(hostSql, portSql, userSql, passwordSql, dbSql, tableSql, tableErrorSql)
```
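The connection opened in `__init__` is never released in the original snippets; an optional close_spider hook (my addition, not in the original post) tidies that up:

```python
    def close_spider(self, spider):
        # Release the cursor and the connection when the spider finishes
        self.myCursor.close()
        self.mySQL.close()
```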
- Store items in the database
```python
    def process_item(self, item, spider):
        # Serialize the whole item as JSON for the `items` column
        items = json.dumps([ItemAdapter(item).asdict()])
        # Parameterized insert
        self.myCursor.execute(
            f"insert into {self.tableSql}(url,media_url,items) values (%s,%s,%s)",
            [item['url'], item['mediaUrl'], items]
        )
        self.mySQL.commit()
        return item
```
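The pipelines read item['url'], item['mediaUrl'], item['mediaName'], and item['mediaDownloadPath'], but the original post never shows items.py. A minimal sketch consistent with those accesses (the class name is an assumption):

```python
import scrapy


class MediaItem(scrapy.Item):
    # Fields inferred from the pipeline code above and below
    url = scrapy.Field()                # page the image was found on
    mediaUrl = scrapy.Field()           # direct image link
    mediaName = scrapy.Field()          # file name ('' = use a timestamp)
    mediaDownloadPath = scrapy.Field()  # target directory ('' = IMAGES_STORE)
```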
- Overriding ImagesPipeline
```python
import os
import time
from urllib.parse import urlparse

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        """
        Request each image URL that needs to be downloaded.
        """
        yield scrapy.Request(url=item['mediaUrl'], dont_filter=True)

    def file_path(self, request, response=None, info=None, *, item=None):
        """
        Customize the storage path of a downloaded image.
        """
        # Image download directory (defaults to IMAGES_STORE, i.e. the desktop path)
        self.imagesStore = self.store.basedir if item['mediaDownloadPath'] == '' else item['mediaDownloadPath']
        # Make sure the path is a directory, not a file
        if os.path.splitext(self.imagesStore)[1] != '':
            self.imagesStore = os.path.splitext(self.imagesStore)[0]
        if not os.path.isdir(self.imagesStore):
            os.makedirs(self.imagesStore)
        # Image name (defaults to a Unix timestamp)
        imageName = str(time.time())[:10] if item['mediaName'] == '' else str(item['mediaName'])
        # Image suffix (defaults to the suffix found in the URL)
        imageNameSuf = os.path.splitext(urlparse(request.url).path)[1]
        # The original post is cut off here; returning the joined path is a
        # plausible completion (assumption, not verified against the source)
        return os.path.join(self.imagesStore, imageName + imageNameSuf)
```
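The post ends before showing the spider itself. To round out the example, here is a hedged sketch of a spider that pulls image links with XPath and fills the item fields used above; the spider name, start URL, and XPath expression are all placeholders:

```python
import scrapy

from ..items import MediaItem  # the item sketch shown earlier (assumption)


class ImageSpider(scrapy.Spider):
    name = 'images'                               # placeholder name
    start_urls = ['https://example.com/gallery']  # placeholder URL

    def parse(self, response):
        # Pull image links with XPath; adapt the expression to the target site
        for src in response.xpath('//img/@src').getall():
            item = MediaItem()
            item['url'] = response.url
            item['mediaUrl'] = response.urljoin(src)
            item['mediaName'] = ''           # '' = fall back to a timestamp
            item['mediaDownloadPath'] = ''   # '' = fall back to IMAGES_STORE
            yield item
```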