安装scrapy:pip3 install scrapy。创建项目:scrapy startproject firstBlood;按提示 cd firstBlood,再执行 scrapy genspider first www.xx.com 生成爬虫文件。执行工程:scrapy crawl spiderName,例如 scrapy crawl first
直接上代码开启管道
# settings.py — minimal working configuration for this project.
ROBOTSTXT_OBEY = False   # don't honor robots.txt for this tutorial crawl
LOG_LEVEL = 'ERROR'      # keep the console quiet: only log errors

# Copy the User-Agent from a real browser request header; the original note
# ("请求中有" = "it's in the request") was a placeholder, not valid Python.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/120.0 Safari/537.36')

# Enable both pipelines; the lower priority number runs first.
ITEM_PIPELINES = {
    'secBlood.pipelines.SecbloodPipeline': 300,
    'secBlood.pipelines.mysqlPileLine': 301,
}
创建的爬虫文件
import scrapy
from secBlood.items import SecbloodItem
class SecbSpider(scrapy.Spider):
    """Spider for a paginated video-listing site.

    ``parse`` scrapes the summary fields of every video on a listing page
    and follows each detail link; ``parse_detail`` extracts the video URL
    and the large cover image from an inline <script> on the detail page,
    then yields the completed item.
    """

    name = 'secb'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://xxx.com/']
    # URL template for listing pages; %d is the page number.
    url = 'https://xxx.com/videos/index/%d'
    # Next page to request; page 1 is already covered by start_urls.
    page_num = 2

    def parse_detail(self, response):
        """Fill videoUrl/imgBig from the detail page's inline JavaScript."""
        # The 8th <script> holds a JS config object; pull values by regex.
        # re_first returns None instead of raising IndexError when the page
        # layout changes (the old .re(...)[0] crashed on a missing match).
        video_url = response.xpath('//script[8]/text()').re_first(r"url: '(.*?)'")
        img_big = response.xpath('//script[8]/text()').re_first(r"pic : '(.*?)'")
        # The half-filled item travels with the request via meta.
        item = response.meta['item']
        item['videoUrl'] = video_url
        item['imgBig'] = img_big
        yield item

    def parse(self, response):
        """Parse one listing page: yield detail requests, then paginate."""
        video_divs = response.xpath(
            '//div[@class="col-xlg videos-item"]/div[@node-type="video"]')
        # The canonical link supplies the site prefix for relative hrefs.
        prefix_url = response.xpath('//link[@rel="canonical"]/@href').extract_first()
        for video in video_divs:
            item = SecbloodItem()
            item['detailUrl'] = prefix_url + video.xpath('./div[1]/a/@href').extract_first()
            item['title'] = video.xpath('./div[1]/a/@alt').extract_first()
            item['imgsmall'] = video.xpath('./div[1]/a/img/@src').extract_first()
            item['videoTime'] = video.xpath(
                './div[1]/a/span[@class="video-duration"]/text()').extract_first()
            item['numPerson'] = video.xpath('./div[2]/div[2]/span[1]/text()').extract_first()
            # Hand the partial item to parse_detail for completion.
            yield scrapy.Request(item['detailUrl'], callback=self.parse_detail,
                                 meta={'item': item})
        # Crawl pages 2..3. (The original wrapped the %-interpolation in a
        # redundant format() call — the interpolation already yields a str.)
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
        print('第一数据抓出完成')
xpath写法
xpath('//a') # 所有a标签(子孙后代)
xpath('//a[2]') # 所有a标签,按索引找第二个
xpath('//a[@id]') # 所有a标签,并且含有id属性
xpath('//a[@id="i1"]') # 所有a标签,并且属性id='i1'
xpath('//a[@href="link.html"][@id="i1"]') # 所有a标签,属性href="link.html" 而且 id="i1"
xpath('//a[contains(@href, "link")]') # 所有a标签,属性href的值包含"link"
xpath('//a[starts-with(@href, "link")]') # 所有a标签,属性href的值以"link"开头
xpath('//a[re:test(@id, "i\d+")]') # 所有a标签 属性id的值 符合正则表达式"i\d+"的规则
xpath('//a[re:test(@id, "i\d+")]/text()').extract() # 所有a标签,取text的值
xpath('//a[re:test(@id, "i\d+")]/@href').extract() # 所有a标签,取href的属性值
xpath('/html/body/ul/li/a/@href').extract() # 取所有的值
xpath('//body/ul/li/a/@href').extract_first() # 取第一个值
items.py
import scrapy
class SecbloodItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Fields filled on the listing page by SecbSpider.parse:
    detailUrl = scrapy.Field()   # absolute URL of the video's detail page
    title = scrapy.Field()       # title text (taken from the <a alt> attribute)
    imgsmall = scrapy.Field()    # thumbnail image URL (listing <img src>)
    videoTime = scrapy.Field()   # duration text from the video-duration span
    numPerson = scrapy.Field()   # text of a stats span — presumably a view count; verify against the page
    # Fields filled later, on the detail page, by SecbSpider.parse_detail:
    videoUrl = scrapy.Field()    # "url: '...'" value scraped from an inline <script>
    imgBig = scrapy.Field()      # "pic : '...'" value scraped from the same <script>
pipelines.py 看数据直接打印item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os.path
import sqlite3
class SecbloodPipeline:
    """Pass-through pipeline: a convenient spot to inspect/print items."""

    def process_item(self, item, spider):
        # No transformation — forward the item unchanged to the next
        # pipeline in the ITEM_PIPELINES chain.
        return item
class mysqlPileLine(object):
    """Persist scraped items into a local SQLite database (``second.db``).

    NOTE(review): despite the name, the backend is sqlite3 — it replaced an
    earlier pymysql pipeline; the class name is kept unchanged so the
    ``ITEM_PIPELINES`` setting keeps resolving.

    The ``secondAqd`` table must exist beforehand: an integer primary-key
    ``id`` column (filled by the leading NULL) followed by seven text
    columns in the INSERT order below.
    """

    conn = None  # sqlite3 connection, opened in open_spider
    cur = None   # cursor of the most recent insert (None until first item)

    def open_spider(self, spider):
        """Open second.db located next to this file when the crawl starts."""
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.conn = sqlite3.connect(os.path.join(base_dir, 'second.db'))

    def process_item(self, item, spider):
        """Insert one item; on failure roll back and keep the crawl going."""
        self.cur = self.conn.cursor()
        # Parameterized SQL: values are bound, never interpolated.
        insert_sql = 'insert into secondAqd values (NULL , ?, ?, ?, ?, ?, ?, ?)'
        try:
            self.cur.execute(insert_sql, (item['detailUrl'], item['title'],
                                          item['imgBig'], item['videoUrl'],
                                          item['imgsmall'], item['videoTime'],
                                          item['numPerson']))
            self.conn.commit()
        except sqlite3.Error as e:
            # One bad row shouldn't abort the whole crawl (the commented
            # pymysql variant below already rolled back like this).
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release cursor and connection.

        Guards against a crawl that produced zero items, in which case
        ``self.cur`` is still None and the old code raised AttributeError.
        """
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()
# class mysqlPileLine(object):
# conn = None
# cur = None
# def open_spider(self, spider):
# self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='123456',db='course',charset='utf8')
#
# def process_item(self, item, spider):
# self.cur = self.conn.cursor()
# insert_sql = 'insert into testav values (NULL ,"%s" , "%s" , "%s" , "%s" , "%s")'%(item['detailUrl'], item['title'], item['imgsmall'], item['videoTime'], item['numPerson'])
# try:
# self.cur.execute(insert_sql)
# self.conn.commit()
# except Exception as e:
# print(e)
# self.conn.rollback()
# return item
#
# def close_spider(self, spider):
# self.cur.close()
# self.conn.close()
小结:
复制粘贴并简单修改后即可直接用于网站数据的获取;需要注意数据库连接的问题:
sqlite3数据库要提前创建好,表名 字段,NULL 是添加了字段 id
MySQL要开启数据库,测试连接成功后才能使用,测试中指定数据库
创建表名 字段名称