Workflow: 1. define the Items model, i.e. the data model (comparable to a Java entity class); 2. write the Pipeline, the module that persists data to the database; 3. write the spider class that parses the crawled data, packages it according to the model defined in Items, and submits it to the Pipeline to be written to the database.
1. Define the Items model
import scrapy

class Cults3DItem(scrapy.Item):
    model_url_init = scrapy.Field()  # URL of the model page
    status = scrapy.Field()          # crawl status of the record
    url_status = scrapy.Field()      # status of the URL itself
    create_time = scrapy.Field()     # record creation timestamp
    update_time = scrapy.Field()     # record update timestamp
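A scrapy.Item is dict-like: fields declared with scrapy.Field() are assigned and read by key, and assigning an undeclared key raises KeyError. A quick usage sketch (the URL value is a placeholder):

import datetime

item = Cults3DItem()
item['model_url_init'] = 'https://cults3d.com/...'  # placeholder
item['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(dict(item))  # {'model_url_init': ..., 'create_time': ...}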
2. Write the Pipeline
import pymysql

class Cults3DPipeline:
    mysql = None
    cursor = None

    def open_spider(self, spider):
        # Open the MySQL connection once, when the spider starts.
        self.mysql = pymysql.Connect(host=spider.settings.get('MODEL_HOST'),
                                     port=spider.settings.get('MODEL_PORT'),
                                     user=spider.settings.get('MODEL_USER'),
                                     password=spider.settings.get('MODEL_PASSWORD'),
                                     db=spider.settings.get('MODEL_DBNAME'))
        self.cursor = self.mysql.cursor()

    def process_item(self, item, spider):
        if spider.name == 'cults3d':
            # Parameterized query instead of string formatting: avoids SQL
            # injection and quoting bugs.
            insert_sql = ("insert into ods_cults3d"
                          "(model_url, status, create_time, update_time, url_status) "
                          "values (%s, %s, %s, %s, %s)")
            try:
                self.cursor.execute(insert_sql,
                                    (item['model_url_init'], item['status'],
                                     item['create_time'], item['update_time'],
                                     item['url_status']))
                self.mysql.commit()
                print('insert model url data success')
            except Exception as e:
                print(e)
                self.mysql.rollback()
                print('insert data fail')
        return item

    def close_spider(self, spider):
        # Release the connection when the spider finishes.
        self.cursor.close()
        self.mysql.close()
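For the pipeline to receive items at all, it must be enabled in settings.py, and the MODEL_* connection settings it reads must be defined there too. A minimal sketch; the module path assumes the project package is named cults3d, and the credential values are placeholders:

# settings.py (sketch; adjust the module path and values to your project)
ITEM_PIPELINES = {
    'cults3d.pipelines.Cults3DPipeline': 300,  # lower number = earlier in the chain
}
MODEL_HOST = '127.0.0.1'
MODEL_PORT = 3306
MODEL_USER = 'root'
MODEL_PASSWORD = 'change-me'
MODEL_DBNAME = 'model_db'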
3. Parse the site data
import datetime
import scrapy
from fake_useragent import UserAgent
from ..items import Cults3DItem  # adjust to your project's items module

class TestSpider(scrapy.Spider):
    name = "xxx"
    allowed_domains = ["xxx.com"]

    def select_url(self):
        # Fetch the model URLs already stored in the database; self.mysql is
        # assumed to be a pymysql connection opened elsewhere (the original
        # called a nonexistent mysql_cursor()).
        cursor = self.mysql.cursor()
        select_sql = "select model_url from ods_cults3d"
        cursor.execute(select_sql)
        return cursor.fetchall()
    def start_requests(self):
        headers = {
            'User-Agent': UserAgent().random  # random User-Agent from fake_useragent
        }
        start_urls = "the initial request URL"  # placeholder
        yield scrapy.Request(url=start_urls,
                             callback=self.parse,
                             headers=headers,
                             dont_filter=True)  # True keeps the request from being dropped by the dedupe filter
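select_url is not wired up in this excerpt; one plausible use (a sketch, assuming each row from cursor.fetchall() is a one-element tuple) is to seed requests from the URLs already stored in MySQL:

    def start_requests(self):
        # Alternative seeding: re-crawl every model URL already in the table.
        for (model_url,) in self.select_url():
            yield scrapy.Request(url=model_url, callback=self.parse, dont_filter=True)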
    def parse(self, response):
        if response.status == 200:
            # Parse the page data
            create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            list_article = response.xpath('//*[@id="content"]/div[3]/article')
            for article in list_article:  # the original used `article` without this loop
                url = article.xpath('./div/a/@href').extract_first()
                item = Cults3DItem()
                item['model_url_init'] = url  # the item declares model_url_init, not url
                item['create_time'] = create_time
                # status / update_time / url_status would also need values
                # before the pipeline's INSERT.
                yield item  # hand the item off to the pipeline
        else:
            print("unexpected response status:", response.status)