通过框架将数据存储到数据库或者到本地
安装库,Mac 中brew 安装 python3.9
pip3 install scrapy
创建项目
scrapy startproject fistBlood
提示:先 cd fistBlood 进入项目目录,再执行 scrapy genspider first www.xx.com 生成爬虫
修改配置settings.py
ROBOTSTXT_OBEY = False LOG_LEVEL = 'ERROR' USER_AGENT = 浏览器中
import scrapy
import re
class FirstSpider(scrapy.Spider):
    """Paginated list spider: extracts video metadata from each list page
    and follows the pagination up to page 3.

    Yields one dict per video so Scrapy's feed exporters / pipelines
    (e.g. `scrapy crawl first -o first.csv`) receive every entry.
    """
    name = 'first'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://xx.com/']
    # Template for paginated list URLs; %d is the page number.
    url = 'https://xx/index/%d'
    page_num = 2  # next page to request (start_urls already covers page 1)

    def parse(self, response):
        """Parse one list page, yield an item dict per video card, then
        schedule the next page while page_num <= 3."""
        div_list = response.xpath('//div[@class="col-xlg videos-item"]/div[@node-type="video"]')
        # Site root taken from the canonical link, used to absolutize hrefs.
        prefix_url = response.xpath('//link[@rel="canonical"]/@href')[0].extract()
        for item in div_list:
            detail_url = prefix_url + item.xpath('./div[1]/a/@href')[0].extract()
            title = item.xpath('./div[1]/a/@alt')[0].extract()
            imgsmall = item.xpath('./div[1]/a/img/@src')[0].extract()
            video_time = item.xpath('./div[1]/a/span[@class="video-duration"]/text()')[0].extract()
            num_person = item.xpath('./div[2]/div[2]/span[1]/text()')[0].extract()
            # BUG FIX: the original accumulated these dicts in a list and
            # ended with `return list_data` inside a generator function, so
            # the items never reached Scrapy (only the Request was yielded).
            # Yield each dict so exporters/pipelines actually see the data.
            yield {
                'detailUrl': detail_url,
                'title': title,
                'imgsmall': imgsmall,
                'videoTime': video_time,
                'numPerson': num_person,
            }
        if self.page_num <= 3:
            # `format(...)` in the original was a no-op wrapper around the
            # already-%-formatted string.
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
        print('第一数据抓出完成')
分页数据抓取完成后存储
scrapy crawl first -o ./first.csv
存储为 csv 文件后可以直接导入数据库中(先在 sqlite3 中创建好表、定义完字段)
管道存储
在 items.py 中
class FistbloodItem(scrapy.Item):
    """Item holding one video entry scraped from a list page.

    Fields mirror the dicts built in the spiders and are consumed by the
    pipelines (text-file writer, sqlite3 writer, MySQL writer).
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    detailUrl = scrapy.Field()   # absolute URL of the video detail page
    title = scrapy.Field()       # video title (from the <a> alt attribute)
    imgsmall = scrapy.Field()    # thumbnail image URL
    videoTime = scrapy.Field()   # duration text, e.g. "12:34"
    numPerson = scrapy.Field()   # view/person count text
在爬虫文件中导入 FistbloodItem相关操作
import scrapy
import re
from fistBlood.items import FistbloodItem
class SecondSpider(scrapy.Spider):
    """Spider that wraps each scraped video entry in a FistbloodItem so
    the configured item pipelines can persist it."""
    name = 'second'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://index/1']

    def parse(self, response):
        """Extract every video card on the page and yield it as an item."""
        cards = response.xpath('//div[@class="col-xlg videos-item"]/div[@node-type="video"]')
        # Site root from the canonical link; prepended to relative hrefs.
        site_root = response.xpath('//link[@rel="canonical"]/@href')[0].extract()
        for card in cards:
            entry = FistbloodItem()
            entry['detailUrl'] = site_root + card.xpath('./div[1]/a/@href')[0].extract()
            entry['title'] = card.xpath('./div[1]/a/@alt')[0].extract()
            entry['imgsmall'] = card.xpath('./div[1]/a/img/@src')[0].extract()
            entry['videoTime'] = card.xpath('./div[1]/a/span[@class="video-duration"]/text()')[0].extract()
            entry['numPerson'] = card.xpath('./div[2]/div[2]/span[1]/text()')[0].extract()
            yield entry
        print('第一数据抓出完成')
写出文件 pipelines.py
class FistbloodPipeline:
    """Pipeline that appends each item as one comma-separated line to a
    local text file held open for the lifetime of the spider."""

    fp = None  # file handle: opened in open_spider, closed in close_spider

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        print('开始-----')
        self.fp = open('./cesreee.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write the five fields as a single CSV-style line.
        fields = (item['detailUrl'], item['title'], item['imgsmall'],
                  item['videoTime'], item['numPerson'])
        self.fp.write(','.join(fields) + '\n')
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        print('结束爬虫')
        self.fp.close()
导入 sqlite3 数据库,创建表字段,在 settings.py 的 ITEM_PIPELINES 中开启:
'fistBlood.pipelines.mysqlPileLine': 301
class mysqlPileLine(object):
    """Pipeline persisting items into a local SQLite database, table `mma`.

    NOTE(review): the class name says "mysql" but this variant uses
    sqlite3 — name kept unchanged so the settings entry
    'fistBlood.pipelines.mysqlPileLine': 301 still resolves.
    """

    conn = None  # sqlite3 connection, owned by this pipeline
    cur = None   # single cursor, created once in open_spider

    def open_spider(self, spider):
        # Resolve the DB file next to this module so the path does not
        # depend on the working directory `scrapy crawl` is run from.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.conn = sqlite3.connect(os.path.join(base_dir, "testav.db"))
        # FIX: create the cursor once here instead of on every item.
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized insert; NULL lets the integer primary key auto-assign.
        insert_sql = 'insert into mma values (NULL , ?, ?, ?, ?, ?)'
        self.cur.execute(insert_sql, (item['detailUrl'], item['title'],
                                      item['imgsmall'], item['videoTime'],
                                      item['numPerson']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # FIX: close the cursor before the connection (original leaked it).
        if self.cur is not None:
            self.cur.close()
        self.conn.close()
导入mysql数据库 工具manager-osx 开启后
在pycharm 中测试连接成功后 创建 字段 表 就完成了
如果出现找不到MySQL 重启就好了
class mysqlPileLine(object):
    """Pipeline persisting items into MySQL table `testav` via pymysql."""

    conn = None  # pymysql connection, opened per spider run
    cur = None   # cursor kept on self so close_spider can release it

    def open_spider(self, spider):
        # NOTE(review): credentials are hard-coded tutorial values; move to
        # settings.py for real use.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123456', db='course', charset='utf8')

    def process_item(self, item, spider):
        self.cur = self.conn.cursor()
        # FIX: the original built the SQL with %-string interpolation into
        # quoted "%s" slots, which breaks on any quote character in the
        # scraped text and is SQL-injectable. Use pymysql's parameter
        # binding (its placeholder is %s) instead.
        insert_sql = 'insert into testav values (NULL , %s , %s , %s , %s , %s)'
        try:
            self.cur.execute(insert_sql, (item['detailUrl'], item['title'],
                                          item['imgsmall'], item['videoTime'],
                                          item['numPerson']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
完成分页数据并写入到数据库中,详情页数据更新中