1,pipeline.py
import pymysql
import paramiko
from paramiko import SFTPClient
from scrapy.exceptions import NotConfigured
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

from guangdong.items import GuangdongItem, IamgesItem, GuangDongExItem


class GuangdongPipeline:
    mysql = None
    cursor = None
    num = 1
    jd_num = 1

    def __init__(self, host, user, password, charset, port, database):
        self.host = host
        self.user = user
        self.password = password
        self.charset = charset
        self.port = port
        self.database = database
        print("Database settings initialized")

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MySQL connection parameters from settings.py.
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            user=crawler.settings['MYSQL_USER'],
            password=crawler.settings['MYSQL_PASSWORD'],
            charset=crawler.settings['MYSQL_CHARSET'],
            database=crawler.settings['MYSQL_DBNAME'],
            port=crawler.settings['MYSQL_PORT']
        )

    def open_spider(self, spider):
        self.mysql = pymysql.Connect(host=self.host, user=self.user, password=self.password,
                                     port=self.port, charset=self.charset, database=self.database)
        self.cursor = self.mysql.cursor()

    def process_item(self, items, spider):
        # Route each item type to its own handler. Pprocess_item and Exprocess_item
        # are defined elsewhere in this class (see the sketch after this listing).
        if isinstance(items, GuangdongItem):
            self.Pprocess_item(items)
        elif isinstance(items, IamgesItem):
            # Image items are handled entirely by MyFilesPipeline; just pass them through.
            pass
        elif isinstance(items, GuangDongExItem):
            self.Exprocess_item(items)
        return items
class MyFilesPipeline(ImagesPipeline):
    # store_uri: local storage path, download_func: download function, settings: project settings
    def __init__(self, store_uri, download_func=None, settings=None):
        print("Initializing image pipeline")
        # Call the parent initializer with the same arguments.
        super().__init__(store_uri, download_func, settings)
        self.sftp = None
        self.transport = None

    # Can be called on the class itself, without creating an instance first.
    @classmethod
    def from_settings(cls, settings):
        store_uri = settings['FILES_STORE']
        return cls(store_uri)

    def open_spider(self, spider):
        settings = get_project_settings()
        host = settings.get('REMOTE_HOST', None)
        port = settings.get('REMOTE_PORT', 22)
        username = settings.get('REMOTE_USERNAME', None)
        password = settings.get('REMOTE_PASSWORD', None)
        # all() is True only when every element is truthy (0, '', None and False count as falsy).
        if not all([host, username, password]):
            raise NotConfigured('Missing remote connection settings.')
        self.transport = paramiko.Transport((host, port))
        self.transport.connect(username=username, password=password)
        self.sftp = SFTPClient.from_transport(self.transport)
        self.spiderinfo = self.SpiderInfo(spider)
        print("SFTP connection established")

    def file_path(self, request, response=None, info=None, *, item=None):
        # Customize the storage path: the folder is the item's image_name
        # (the policy_id) and the file name is derived from the image URL.
        if isinstance(item, IamgesItem):
            foldername = item['image_name']
            filename = request.url[-20:].replace('/', '-')
            return f'/{foldername}/{filename}'

    def item_completed(self, results, item, info):
        # Upload every successfully downloaded image to the remote server.
        settings = get_project_settings()
        for ok, result in results:
            if not ok:
                continue
            file_path = result['path']
            local_path = settings['FILES_STORE'] + file_path
            dir_path = '/usr/share/nginx/html/images'
            try:
                # Check whether the remote folder (named after the 32-character policy_id) exists.
                self.sftp.stat(f'{dir_path}/{file_path[1:33]}')
            except IOError:
                # The folder does not exist yet, so create it.
                self.sftp.mkdir(f'{dir_path}/{file_path[1:33]}')
            # Copy the local file to the remote server.
            self.sftp.put(local_path, dir_path + file_path)
        # Delegate the return value to the parent implementation.
        return super().item_completed(results, item, info)

    def close_spider(self, spider):
        if self.sftp:
            self.sftp.close()
        if self.transport:
            self.transport.close()
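
GuangdongPipeline hands the actual MySQL writes to Pprocess_item and Exprocess_item, which are not part of this listing. A minimal sketch of what Pprocess_item could look like inside the class, assuming a hypothetical policy table whose columns mirror the item fields (the table and column names are placeholders, not the project's real schema):

    def Pprocess_item(self, items):
        # Sketch only: insert one policy item into an assumed `policy` table.
        sql = (
            "INSERT INTO policy (policy_id, url, title, content, publish_date, "
            "rating, province, city, district, grab_time) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        self.cursor.execute(sql, (
            items['policy_id'], items['url'], items['title'], items['content'],
            items['publish_date'], items['rating'], items['province'],
            items['city'], items['district'], items['grab_time'],
        ))
        self.mysql.commit()
        # num presumably counts inserted rows.
        self.num += 1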
2,settings.py
ITEM_PIPELINES = {
    # Lower numbers run first, so MyFilesPipeline downloads and uploads the
    # images before GuangdongPipeline writes the item to MySQL.
    'guangdong.pipelines.MyFilesPipeline': 300,
    'guangdong.pipelines.GuangdongPipeline': 302,
    # 'scrapy.pipelines.images.ImagesPipeline': 5
}

# Local directory where downloaded images are stored before uploading.
FILES_STORE = 'local/images'

# Remote server credentials used by MyFilesPipeline (fill in your own values).
REMOTE_HOST = 'server IP'
REMOTE_PORT = 22  # SSH port number
REMOTE_USERNAME = 'username'
REMOTE_PASSWORD = 'password'
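
The spider in the next section imports GuangdongItem and IamgesItem from guangdong.items, which is not listed here. Judging from the fields used by the spider and by MyFilesPipeline, the image item is presumably declared roughly like this (a sketch; the real items.py may define more fields):

import scrapy

class IamgesItem(scrapy.Item):
    # Standard ImagesPipeline fields plus the folder name used in file_path().
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_name = scrapy.Field()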
3,pachong.py
import copy
import datetime
import logging
import math
import random
import re
import uuid
import scrapy
import json
from pymysql.converters import escape_string
from scrapy import Selector
from guangdong.items import GuangdongItem, IamgesItem
from guangdong.spiders.mysqlcon import is_news_not_saved
class Gd341gdyxjdSpider(scrapy.Spider):
    name = 'gd3_41gdyxjd'
    allowed_domains = ['allowed crawl domain']
    start_urls = ['url']
    # Non-200 statuses that should reach our callbacks instead of being dropped by Scrapy.
    handle_httpstatus_list = [404, 500]
    num = 0
    fnum = 0
    def parse(self, response):
        join_url = response.request.url
        if response.status in self.handle_httpstatus_list:
            yield scrapy.Request(url=join_url, callback=self.after_404, dont_filter=True)
            return
        items = GuangdongItem()
        images = IamgesItem()
        items['url'] = 'hh'
        items['title'] = 'hh'
        items['publish_date'] = ''
        items['referer'] = 'hh'
        items['ministry'] = 'hh'
        items['netloc'] = 'hh'
        # Build a unique 32-character policy id from a UUID with the dashes stripped.
        policy_id = ''.join(str(uuid.uuid5(uuid.NAMESPACE_DNS, str(uuid.uuid1()) + str(random.random()))).split('-'))
        # Try the known content containers one after another.
        contents = response.css('.article-content')
        if not contents:
            contents = response.css('div.gz_content')
        if not contents:
            contents = response.css('div.content-box')
        content = ''.join(contents.getall())
        image_urls = []
        for img in contents.css("img"):
            image = img.xpath("@src").get()
            # Skip missing sources and inline data: URIs, they are not downloadable files.
            if not image or image.startswith("data"):
                continue
            url = response.urljoin(image)
            image_urls.append(url)
            # Rewrite the img src in the saved content so it points at the
            # uploaded copy on the remote server.
            src_regex = r'src=[\'"]?([^\'" >]+)'
            new_tag = re.sub(src_regex, f'src="path-prefix/{policy_id}/{url[-20:].replace("/", "-")}"', img.get())
            content = content.replace(img.get(), new_tag)
        images['image_urls'] = image_urls
        images['image_name'] = policy_id
        yield images
        items['content'] = escape_string(content)
        items['is_explanation'] = 1
        items['policy_id'] = policy_id
        items['rating'] = '省'  # provincial-level policy
        items['district'] = ''
        # city
        items['city'] = ''
        items['province'] = '....'
        items['encoding'] = 'UTF-8'
        items['grab_time'] = datetime.datetime.now()
        yield items
    def after_404(self, response):
        self.fnum += 1
        print(f"{self.num}error: dead link: " + response.request.url)
        logging.info(f"{self.num}error: dead link: " + response.request.url)

    def abnormal(self, response):
        self.num += 1
        print(f"{self.num}error: abnormal link: " + response.request.url)
        logging.info(f"{self.num}error: abnormal link: " + response.request.url)
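
Before running the full crawl it can be worth checking the SFTP credentials with a small standalone script that mirrors what MyFilesPipeline.open_spider does (host, port, username and password below are placeholders):

import paramiko
from paramiko import SFTPClient

# Connect exactly the way the pipeline does and list the target directory.
transport = paramiko.Transport(('your.server.ip', 22))
transport.connect(username='username', password='password')
sftp = SFTPClient.from_transport(transport)
print(sftp.listdir('/usr/share/nginx/html/images'))
sftp.close()
transport.close()

Once the connection works, the spider runs as usual with scrapy crawl gd3_41gdyxjd.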