Using the Scrapy framework
Creating and running a Scrapy project
scrapy startproject project_name
cd project_name
scrapy genspider spider_name domain (the site to crawl)
scrapy crawl spider_name (run the spider)
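For the movie example below, the concrete commands would be (project and spider names taken from the code that follows):
scrapy startproject scrapy05_movie
cd scrapy05_movie
scrapy genspider mv www.ygdy8.net
scrapy crawl mv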
Example: scraping the Movie Heaven (电影天堂) site
Writing the spider
import scrapy
from scrapy05_movie.items import Scrapy05MovieItem

class MvSpider(scrapy.Spider):
    name = 'mv'
    allowed_domains = ['www.ygdy8.net']
    start_urls = ['https://www.ygdy8.net/html/gndy/china/index.html']
    base_url = 'https://www.ygdy8.net/html/gndy/china/list_4_'
    page = 1

    def parse(self, response):
        table_list = response.xpath("//div[@class='co_content8']/ul//table")
        for a in table_list:
            name = a.xpath(".//td[2]//a[2]/text()").extract_first()
            src = a.xpath(".//td[2]//a[2]/@href").extract_first()
            time = a.xpath(".//td[2]/font/text()").extract_first()
            url = "https://www.ygdy8.net" + src
            # request the detail page, passing the fields along via meta
            yield scrapy.Request(url=url, callback=self.parse_second,
                                 meta={'name': name, 'time': time, 'url': url})
        # follow the next list page until page 146
        if self.page < 146:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '.html'
            yield scrapy.Request(url=url, callback=self.parse)

    # scrape the image URL from the detail page
    def parse_second(self, response):
        img_src = response.xpath("//div[@id='Zoom']//img/@src").extract_first()
        name = response.meta['name']
        time = response.meta['time']
        url = response.meta['url']
        movie = Scrapy05MovieItem(src=img_src, name=name, time=time, url=url)
        yield movie
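As a side note, Scrapy 1.7+ also supports cb_kwargs for handing values to a callback; unlike meta, the values arrive as ordinary keyword arguments. A minimal sketch of the same hand-off:
yield scrapy.Request(url=url, callback=self.parse_second,
                     cb_kwargs={'name': name, 'time': time, 'url': url})

# the callback then declares them as parameters
def parse_second(self, response, name, time, url):
    ...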
Define the item fields in the items file
import scrapy

class Scrapy05MovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    time = scrapy.Field()
    url = scrapy.Field()
    src = scrapy.Field()
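An Item behaves like a dict, so fields can be filled either through the constructor (as the spider does above) or by key; a quick illustration:
movie = Scrapy05MovieItem()
movie['name'] = 'example title'   # set a field by key
print(movie['name'])              # read it back
print(dict(movie))                # convert to a plain dict, e.g. for serialization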
Save the data to a json file and download the images to a folder. First enable the pipelines in the settings file.
The first entry is the pipeline generated with the project (it still has to be enabled); the second is the custom download pipeline. The number is the priority: lower values run earlier (the usual range is 0-1000).
ITEM_PIPELINES = {
    'scrapy05_movie.pipelines.Scrapy05MoviePipeline': 300,
    'scrapy05_movie.pipelines.MoviedownloadPipeline': 301,
}
In the pipelines file, open and close the output file around the crawl and write each item to it:
# open and close the json file once per crawl instead of once per item, to avoid wasting resources
class Scrapy05MoviePipeline:
    # runs when the spider starts
    def open_spider(self, spider):
        self.fp = open('movie.json', 'w', encoding='utf-8')

    # write each item to the json file
    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    # runs when the spider closes
    def close_spider(self, spider):
        self.fp.close()
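Note that str(item) writes a Python repr, not valid JSON. If a real JSON-lines file is wanted, process_item could serialize with the json module instead; a sketch (one object per line is an assumption here):
import json

def process_item(self, item, spider):
    # dict(item) turns the Item into a plain dict; ensure_ascii=False keeps Chinese text readable
    self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
    return item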
import urllib.request

class MoviedownloadPipeline:
    # download with urllib.request.urlretrieve; the ./movie folder must exist
    # at the same directory level as the json file
    def process_item(self, item, spider):
        url = item.get('src')
        filename = './movie/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=filename)
        return item
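urlretrieve blocks the pipeline while it downloads. Scrapy's built-in ImagesPipeline downloads through the scheduler instead, with retries and deduplication; a rough sketch, assuming IMAGES_STORE is set in settings and Pillow is installed (the class name here is made up):
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class MovieImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # hand the poster URL to the downloader
        yield Request(item['src'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # store the image as <name>.jpg under IMAGES_STORE
        return request.meta['name'] + '.jpg'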
Saving the data to a database, using a Scrapy CrawlSpider that scrapes dushu.com (读书网) as an example
The difference is that the spider is created with scrapy genspider -t crawl spider_name domain.
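For this example, with the spider name and domain used in the code below, that command would be:
scrapy genspider -t crawl bk www.dushu.com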
Set the key parameters in the settings file, then enable the pipelines just as before:
DB_HOST = '127.0.0.1'   # local IP; another host's IP works too
DB_PORT = 3306          # port number
DB_USER = 'root'        # database user
DB_PASSWORD = '123456'  # database password
DB_NAME = 'spider01'    # database name
DB_CHARSET = 'utf8'     # character set
ITEM_PIPELINES = {
    'scrapy06_book.pipelines.Scrapy06BookPipeline': 300,
    'scrapy06_book.pipelines.MysqlPipeline': 301
}
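The spider01 database and a book table have to exist before the pipeline runs; a minimal DDL sketch matching the insert statement used later (column types and sizes are assumptions):
CREATE DATABASE IF NOT EXISTS spider01 CHARSET utf8;
USE spider01;
CREATE TABLE IF NOT EXISTS book (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    src VARCHAR(255),
    author VARCHAR(255),
    url VARCHAR(255)
);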
Writing the CrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy06_book.items import Scrapy06BookItem

class BkSpider(CrawlSpider):
    name = 'bk'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1188_1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/book/1188_\d+\.html'),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        li_list = response.xpath("//div[@class='bookslist']//li")
        for li in li_list:
            name = li.xpath(".//img/@alt").extract_first()
            src = li.xpath(".//img/@data-original").extract_first()
            author = li.xpath(".//p[1]/text()").extract_first()
            url = li.xpath(".//a/@href").extract_first()
            url = "https://www.dushu.com" + url
            book = Scrapy06BookItem(name=name, src=src, author=author, url=url)
            yield book
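With follow=False the rule only extracts links from the start page, so only the pages linked there are crawled. Setting follow=True would keep extracting matching pagination links from every crawled page as well:
rules = (
    Rule(LinkExtractor(allow=r'/book/1188_\d+\.html'),
         callback='parse_item',
         follow=True),   # also follow matching links found on crawled pages
)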
With MySQL already installed, install the pymysql module. pymysql must be installed into the Python interpreter that the project actually uses (its site-packages path); otherwise the crawl fails with "No module named pymysql".
Configure the database connection in the pipelines file:
class Scrapy06BookPipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()
import pymysql
from scrapy.utils.project import get_project_settings  # reads the database parameters from settings

# database pipeline
class MysqlPipeline:
    def open_spider(self, spider):
        # load the database parameters from the settings file
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    # connect to the database
    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    # insert each item into the database
    def process_item(self, item, spider):
        # a parameterized query, so quotes in the scraped data cannot break the SQL
        sql = 'insert into book(name, src, author, url) values (%s, %s, %s, %s)'
        self.cursor.execute(sql, (item['name'], item['src'], item['author'], item['url']))
        self.conn.commit()
        return item

    # close the connection
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
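As an aside, the more idiomatic way for a pipeline to read settings is the from_crawler classmethod, which Scrapy calls with the running crawler; a sketch of the same parameter loading:
class MysqlPipeline:
    def __init__(self, settings):
        self.host = settings['DB_HOST']
        # ... load the remaining parameters as above

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings holds the project settings
        return cls(crawler.settings)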
Notes
allowed_domains = ['www.dushu.com']
start_urls = ['https://www.dushu.com/book/1188_1.html']
Remove any trailing slash / from the start URL, otherwise an error is raised.
When the spider returns an empty list, first print a throwaway marker inside the parse function. If the marker shows up, the request succeeded and the problem is in the XPath. If the xpath_helper browser plugin finds the data on the rendered page but the spider does not, the XPath is still at fault, because the HTML the server returns can differ from the DOM the browser renders; the expression has to be debugged step by step against the response.
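The scrapy shell makes this kind of XPath debugging much faster, since it fetches the page exactly as the spider would:
scrapy shell "https://www.dushu.com/book/1188_1.html"
# then, inside the shell, try the expression directly:
>>> response.xpath("//div[@class='bookslist']//li")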