爬虫scrapy——网站开发热身中篇完结

最新推荐文章于 2022-06-28 15:22:25 发布

cj1064789374

最新推荐文章于 2022-06-28 15:22:25 发布

阅读量226

点赞数

分类专栏： 2020年研究生学习笔记

本文链接：https://blog.csdn.net/cj1064789374/article/details/108532865

版权

2020年研究生学习笔记专栏收录该内容

28 篇文章 6 订阅

订阅专栏

# main.py: place it next to scrapy.cfg and run it; the effect is the same as
# typing the command in a console.
import os

crawl_command = ' '.join(['scrapy', 'crawl', 'books', '-o', 'books.csv'])
os.system(crawl_command)

#第一讲：入门
import scrapy
class BooksSpider(scrapy.Spider):
    """Spider that walks the books.toscrape.com listing pages.

    Every book on a page is emitted as a dict with its title and price, and
    the "next" pager link is followed for as long as one is present.
    """

    name = 'books'  # unique identifier of this spider
    start_urls = ['http://books.toscrape.com/']  # page(s) to start from

    def parse(self, response):
        """Extract the books of one listing page and request the next page."""
        for product in response.css('article.product_pod'):
            title = product.xpath('./h3/a/@title').extract_first()  # XPath form
            cost = product.css('p.price_color::text').extract_first()  # CSS form
            yield {'name': title, 'price': cost}

        next_href = response.css('ul.pager li.next a::attr(href)').extract_first()
        if not next_href:
            return
        # The href is relative, so resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

#第二讲：改start_urls为函数
import scrapy
class BooksSpider(scrapy.Spider):
    """Book spider whose first request is built by start_requests().

    Building the request by hand (instead of listing start_urls) allows the
    callback, headers and dont_filter flag to be set explicitly.
    """

    name = 'books'  # unique identifier of this spider

    def start_requests(self):
        """Yield the single initial request, sent with a custom User-Agent."""
        first_request = scrapy.Request(
            'http://books.toscrape.com/',
            callback=self.parse,
            headers={'User-Agent': 'Mozilla/5.0'},
            dont_filter=True,
        )
        yield first_request

    def parse(self, response):
        """Yield one dict per book, then a request for the next page if any."""
        for entry in response.css('article.product_pod'):
            yield {
                'name': entry.xpath('./h3/a/@title').extract_first(),       # XPath form
                'price': entry.css('p.price_color::text').extract_first(),  # CSS form
            }

        relative_next = response.css('ul.pager li.next a::attr(href)').extract_first()
        if relative_next:
            absolute_next = response.urljoin(relative_next)
            yield scrapy.Request(absolute_next, callback=self.parse)  # schedule the next page

#第三讲：xml与css选择器
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
# Sample HTML document used by every selector example below.
text = '''
<html>
    <head>
        <base href='http://example.com/' />
        <title>example website</title>
    <body>
        <ul>
            <li>C++</li>
            <li>JAVA++</li>
            <li>python</li>
        </ul>
        <div id='images'>hello<a href='1314'>CJ<img src='img1.jpg'></a><a href='520'>GJL<img src='img2.jpg'></a>world</div>
        <ul>
            <li>aaa</li>
            <li>bbb</li>
            <li>ccc</li>
        </ul>
        <h1>hello</h1>
        <h1>world</h1>
        <b>CJ GJL</b>
    </body>
</html>
'''
# 1. Two ways to build a Selector: from an HtmlResponse or directly from a str.
# NOTE: despite its name, `response` below is a Selector wrapping an HtmlResponse.
response=Selector(response=HtmlResponse(url='http://www.example.com',body=text,encoding='utf8'))
print(response)
selector=Selector(text=text)
print(selector)
    # <Selector xpath=None data='<html>\n    <head>\n        <base href=...'>
    # <Selector xpath=None data='<html>\n    <head>\n        <base href=...'>
print('--------------------------')

# 2. XML and XPath in detail
# 2.1 XML node types: root node, element node, attribute node, text node
# 2.2 Relationships between nodes: parent/child, sibling, ancestor, descendant
# 2.3 Basic XPath syntax:
# /         the root node, or the parent/child separator; html is a child of the root, hence /html
# ELEMENT   child element nodes of the current node that are named ELEMENT
# @ATTR     attribute node of the current node that is named ATTR
# text()    child text nodes of the current node
print(response.xpath('/html'))                            # absolute path from the root: the html element node
print(response.xpath('/html/body/div/a').extract())       # absolute path from the root: the a element nodes
print(response.xpath('/html/body/div/a/text()').extract())# absolute path from the root: text nodes of the a elements
print(response.xpath('/html/body/div/a/@href').extract()) # absolute path from the root: href attribute nodes of the a elements
    # [<Selector xpath='/html' data='<html>\n    <head>\n        <base href=...'>]
    # ['<a href="1314">CJ<img src="img1.jpg"></a>', '<a href="520">GJL<img src="img2.jpg"></a>']
    # ['CJ', 'GJL']
    # ['1314', '520']
# //        all descendant nodes
print(response.xpath('//div//img').extract())             # every img element node under any div element in the document
print(response.xpath('//div//text()').extract())          # every text node under any div element in the document
print(response.xpath('//div//@href').extract())           # every href attribute node under any div element in the document
    # ['<img src="img1.jpg">', '<img src="img2.jpg">']
    # ['hello', 'CJ', 'GJL', 'world']
    # ['1314', '520']
# *         all child element nodes of the current element node
# @*        all attribute nodes of the current element node
# text()    all child text nodes of the current element node
print(response.xpath('//div/*').extract())                 # all child element nodes of every div element
print(response.xpath('//div/@*').extract())                # all attribute nodes of every div element
print(response.xpath('//div/text()').extract())            # the direct child text nodes of every div element
    # ['<a href="1314">CJ<img src="img1.jpg"></a>', '<a href="520">GJL<img src="img2.jpg"></a>']
    # ['images']
    # ['hello', 'world']
# .         the current node
# ..        the parent of the current node
print(response.xpath('//div').xpath('./text()').extract()) # the second xpath() starts from the current selector, so ./ makes the path relative
print(response.xpath('//img').xpath('..').extract())       # the second xpath() starts from the current selector, so .. is the parent node
print(response.xpath('//img/..').extract())                # same effect as the previous line
    # ['hello', 'world']
    # ['<a href="1314">CJ<img src="img1.jpg"></a>', '<a href="520">GJL<img src="img2.jpg"></a>']
    # ['<a href="1314">CJ<img src="img1.jpg"></a>', '<a href="520">GJL<img src="img2.jpg"></a>']
# [predicate]   extra condition
print(response.xpath('//a[2]').extract())               # the 2nd a element node (counting from 1)
print(response.xpath('//a[last()]').extract())          # the last a element node
print(response.xpath('//a[position()<=1]').extract())   # the first 1 a element node(s)
print(response.xpath('//div[@id]').extract())           # div element nodes that have an id attribute
print(response.xpath('//div[@id="images"]').extract())  # div element nodes whose id attribute equals images
    # ['<a href="520">GJL<img src="img2.jpg"></a>']
    # ['<a href="520">GJL<img src="img2.jpg"></a>']
    # ['<a href="1314">CJ<img src="img1.jpg"></a>']
    # ['<div id="images">hello<a href="1314">CJ<img src="img1.jpg"></a><a href="520">GJL<img src="img2.jpg"></a>world</div>']
    # ['<div id="images">hello<a href="1314">CJ<img src="img1.jpg"></a><a href="520">GJL<img src="img2.jpg"></a>world</div>']
print('--------------------------')

# 3. Two ways to pull data out (extract_first vs. iteration): the data values form a list and we take its str elements
print(selector.xpath('//li/text()'))                           # (1) this only builds a list whose elements are Selectors
print(selector.xpath('//li/text()').extract())                 # (2) extract the data values to get a list of str
print(selector.xpath('//b').extract_first())                   # (3) when the list has a single element, extract_first is the usual choice
for sel in selector.xpath('//h1'):print(sel.xpath('./text()')) # (4) when the list has several elements, iterate with for
    # NOTE(review): the sample text contains six <li> elements, but the two recorded
    # outputs below list only three entries; they look stale - confirm by re-running.
    # [<Selector xpath='//li/text()' data='C++'>, <Selector xpath='//li/text()' data='JAVA++'>, <Selector xpath='//li/text()' data='python'>]
    # ['C++', 'JAVA++', 'python']
    # <b>CJ GJL</b>
    # [<Selector xpath='./text()' data='hello'>]
    # [<Selector xpath='./text()' data='world'>]
print('--------------------------')

# 4. CSS selectors
print(selector.css('img').extract())                            # every img
print(selector.css('base,title').extract())                     # every base and every title
print(selector.css('body img').extract())                       # every img among the descendants of body
print(selector.css('body>div').extract())                       # every div among the direct children of body
print(selector.css('[id]').extract())                           # every element node that has an id attribute
print(selector.css('[id=images]').extract())                    # every element node whose id attribute equals images
print(selector.css('ul:nth-child(3)>li:nth-child(2)').extract())# li that is the 2nd child of a ul that is itself the 3rd child of its parent
print(selector.css('ul:first-child>li:last-child').extract())   # li that is the last child of a ul that is itself the 1st child of its parent
print(selector.css('a::text').extract())                        # the text inside every a element node
    # ['<img src="img1.jpg">', '<img src="img2.jpg">']
    # ['<base href="http://example.com/">', '<title>example website</title>']
    # ['<img src="img1.jpg">', '<img src="img2.jpg">']
    # ['<div id="images">hello<a href="1314">CJ<img src="img1.jpg"></a><a href="520">GJL<img src="img2.jpg"></a>world</div>']
    # ['<div id="images">hello<a href="1314">CJ<img src="img1.jpg"></a><a href="520">GJL<img src="img2.jpg"></a>world</div>']
    # ['<div id="images">hello<a href="1314">CJ<img src="img1.jpg"></a><a href="520">GJL<img src="img2.jpg"></a>world</div>']
    # ['<li>bbb</li>']
    # ['<li>python</li>']
    # ['CJ', 'GJL']

补充说明一下：第一讲中的下一页链接的CSS写法是'ul.pager li.next a::attr(href)'
源html是<ul class="pager"><li class="next"><a href="catalogue/page-2.html">next</a></li></ul>
也就是说，css中a.b指有class属性为b的元素节点a，a::attr(b)指元素节点a的b属性的值，若取文本直接a::text

#第四讲：Items
#items.py
from scrapy import Item,Field
class BookItem(Item):
    """Scraped book record with two declared fields."""

    name = Field()   # book title
    price = Field()  # book price
#book_spider.py
import scrapy
from ..items import BookItem
class BooksSpider(scrapy.Spider):
    """Book spider that emits BookItem objects instead of plain dicts."""

    name = 'books'  # unique identifier of this spider
    start_urls = ['http://books.toscrape.com/']  # page(s) to start from

    def parse(self, response):
        """Yield a BookItem per book, then a request for the next page if any."""
        for product in response.css('article.product_pod'):
            record = BookItem()
            title = product.xpath('./h3/a/@title').extract_first()
            record['name'] = title
            cost = product.css('p.price_color::text').extract_first()
            record['price'] = cost
            yield record

        next_href = response.css('ul.pager li.next a::attr(href)').extract_first()
        if not next_href:
            return
        # The href is relative, so resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

#第五讲：写入文件时转换及查重（与上一讲的两个文件结合，总共5个文件已改写了4个）
#settings.py
# Item pipelines enabled for the project, keyed by dotted class path.
# NOTE(review): per the Scrapy docs lower numbers run first, so the price
# converter (300) would run before the duplicate filter (350) - confirm.
ITEM_PIPELINES = {
    'example.pipelines.PriceConverterPipiline': 300,
    'example.pipelines.DuplicatesPipiline': 350,
}
#pipelines.py
from scrapy.exceptions import DropItem
class DuplicatesPipiline(object):
    """Item pipeline that lets each book name through only once."""

    def __init__(self):
        # Names of the books already passed on by this pipeline.
        self.book_set = set()

    def process_item(self, item, spider):
        """Return *item* the first time its name is seen; drop repeats.

        Raises:
            DropItem: if an item with the same 'name' was already processed,
                so the duplicate is not written to the output.
        """
        title = item['name']
        if title not in self.book_set:
            self.book_set.add(title)
            return item
        raise DropItem('Duplicate book found: %s' % item)
class PriceConverterPipiline(object):
    """Item pipeline that rewrites item['price'] from a pound price to RMB text."""

    exchange_rate = 8.5309  # multiplier applied to the numeric part of the price

    def process_item(self, item, spider):
        """Convert item['price'] in place and return the item.

        Args:
            item: an Item or dict whose 'price' is a string such as '£51.77'.
            spider: the running spider (unused).

        Returns:
            The same item, with 'price' replaced by 'RMB <amount>' rounded to
            two decimals.

        Raises:
            ValueError: if the price string contains no parsable number.
        """
        raw_price = item['price']
        # Skip everything before the first numeric character instead of blindly
        # dropping exactly one character: the currency prefix may be missing,
        # padded with whitespace, or more than one character long (e.g. a
        # mis-decoded 'Â£').
        number_start = next(
            (pos for pos, ch in enumerate(raw_price) if ch in '0123456789+-.'),
            len(raw_price),
        )
        converted = float(raw_price[number_start:]) * self.exchange_rate
        item['price'] = 'RMB %.2f' % converted
        return item

# Lecture 6: extracting links (personally I find this of limited use)
# A Selector works as well; the example below uses LinkExtractor, which is much the same
        # Fragment of a spider callback body (it uses `response`, `self.parse`
        # and `yield`); presumably it replaces the selector-based next-page
        # logic kept in the string literal that follows.
        from scrapy.linkextractors import LinkExtractor
        # Only links found inside the 'ul.pager li.next' region are extracted.
        links = LinkExtractor(restrict_css='ul.pager li.next').extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url,callback=self.parse)
'''
        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
'''

#第七讲：自定义导出格式（scrapy crawl books -o books.csv中的csv格式其实够用）
#my_exporters.py（与settings.py同级）
from scrapy.exporters import BaseItemExporter
import xlwt
class ExcelItemExporter(BaseItemExporter):
    """Item exporter that writes every item as one row of an xlwt worksheet."""

    def __init__(self, file, **kwargs):
        """Set up an empty workbook that is saved to *file* when exporting ends.

        Args:
            file: the output target later handed to ``Workbook.save``.
            **kwargs: standard exporter options, consumed by the base class.
        """
        # BaseItemExporter has no public ``configure`` method; its constructor
        # is what processes the exporter options.
        super().__init__(**kwargs)
        self.file = file
        self.wbook = xlwt.Workbook()
        self.wsheet = self.wbook.add_sheet('scrapy')
        self.row = 0  # index of the next worksheet row to write

    def finish_exporting(self):
        """Write the accumulated workbook to the output file."""
        self.wbook.save(self.file)

    def export_item(self, item):
        """Append one item to the sheet, one serialized field value per column."""
        # Single leading underscore on purpose: ``self.__get_serialized_fields``
        # would be name-mangled to ``_ExcelItemExporter__get_serialized_fields``
        # and fail with AttributeError.
        fields = self._get_serialized_fields(item)
        for col, (_, value) in enumerate(fields):
            self.wsheet.write(self.row, col, value)
        self.row += 1
#settings.py
# Maps the export format name 'excel' to the dotted path of the custom exporter class.
FEED_EXPORTERS = {
    'excel': 'example.my_exporters.ExcelItemExporter',
}

#第八讲：分析页面
scrapy shell http://matplotlib.org/examples/index.html

In [1]: view(response)
Out[1]: True

In [2]: from scrapy.linkextractors import LinkExtractor

In [3]: le=LinkExtractor(restrict_css='div.toctree-wrapper.compound li.toctree-l2')

In [4]: links=le.extract_links(response)

In [5]: [link.url for link in links]
Out[5]: 
['https://matplotlib.org/examples/animation/animate_decay.html',
 'https://matplotlib.org/examples/animation/basic_example.html',
 'https://matplotlib.org/examples/animation/basic_example_writer.html',
……
 'https://matplotlib.org/examples/widgets/slider_demo.html',
 'https://matplotlib.org/examples/widgets/span_selector.html']

In [6]: len(links)
Out[6]: 506

In [7]: fetch('https://matplotlib.org/examples/animation/animate_decay.html')
2020-09-13 10:57:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://matplotlib.org/examples/animation/animate_decay.html> (referer: None)

In [8]: view(response)
Out[8]: True

In [9]: href=response.css('a.reference.external::attr(href)').extract_first()

In [10]: href
Out[10]: 'animate_decay.py'

In [11]: response.urljoin(href)
Out[11]: 'https://matplotlib.org/examples/animation/animate_decay.py'

#第九讲：爬取图片（爬取失败，原因未知）
scrapy startproject so_image
cd so_image
scrapy genspider images image.so.com

#images.py
import scrapy
from scrapy import Request
import json
class ImagesSpider(scrapy.Spider):
    """Spider that pages through a JSON image listing on image.so.com.

    Each response is decoded as JSON; the 'qhimg_url' of every entry in its
    'list' is emitted under 'image_urls', and the next page is requested
    until the listing is exhausted or MAX_DOWNLOAD_NUM is reached.
    """

    BASE_URL = 'http://image.so.com/zj?ch=art&sn=%s&listtype=new&temp=1'  # %s is the start offset
    start_index = 0           # offset of the next page to request
    MAX_DOWNLOAD_NUM = 1000   # stop paging once the offset reaches this value
    name = 'images'
    start_urls = [BASE_URL % 0]

    def parse(self, response):
        """Yield the image URLs of one page, then the request for the next page."""
        payload = json.loads(response.body.decode('utf-8'))

        image_urls = []
        for entry in payload['list']:
            image_urls.append(entry['qhimg_url'])
        yield {'image_urls': image_urls}

        # Advance the offset by the number of entries this page reported.
        self.start_index += payload['count']
        has_more = payload['count'] > 0 and self.start_index < self.MAX_DOWNLOAD_NUM
        if has_more:
            yield Request(self.BASE_URL % self.start_index)

#settings.py
# Enable Scrapy's built-in images pipeline for the 'image_urls' items.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
# Storage location setting for the images pipeline.
IMAGES_STORE = 'download_images'

scrapy crawl images

#第十讲：爬取文件（爬取失败，原因未知）
创建项目
scrapy startproject matplotlib_examples

进入目录
cd matplotlib_examples

定义爬虫名字及要爬的域名
scrapy genspider examples matplotlib.org

***下面依次修改items,examples,middlewares,pipelines,settings五个py文件***

首先，改items.py，确定要爬的数据内容，后面examples.py要导入使用
import scrapy
class ExampleItem(scrapy.Item):
    """Item describing the example source file(s) to download."""

    file_urls = scrapy.Field()  # list of URLs to download (set by the spider)
    files = scrapy.Field()      # presumably filled in by the files pipeline - confirm

再者，改examples.py实现爬取
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import ExampleItem
class ExamplesSpider(scrapy.Spider):
    """Spider that collects the source-file URL of every matplotlib example page."""

    name = 'examples'
    allowed_domains = ['matplotlib.org']
    start_urls = ['http://matplotlib.org/examples/index.html']

    def parse(self, response):
        """Follow every example link listed on the index page.

        Yields:
            One request per extracted link, handled by ``parse_example``.
        """
        # The dot is escaped so that only a literal '/index.html' suffix is
        # excluded, not any character followed by 'html'.
        le = LinkExtractor(restrict_css='div.toctree-wrapper.compound',
                           deny=r'/index\.html$')
        # Extract once and reuse the result for both the count and the loop.
        links = le.extract_links(response)
        print(len(links))
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_example)

    def parse_example(self, response):
        """Build an ExampleItem pointing at the page's source-file download link.

        Returns:
            An ExampleItem whose 'file_urls' holds the absolute download URL,
            or None when the page has no such link.
        """
        href = response.css('a.reference.external::attr(href)').extract_first()
        if not href:
            # Without this guard urljoin(None) would return the page URL and
            # the HTML page itself would be queued for download as a "file".
            self.logger.warning('No source link found on %s', response.url)
            return None
        example = ExampleItem()
        example['file_urls'] = [response.urljoin(href)]
        return example

然后，不用改middlewares.py，除非用到代理服务器

接着，改pipelines.py，自定义存储函数（主要功能是重命名下载的文件名）
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
from os.path import basename,dirname,join
class MyFilesPipeline(FilesPipeline):
    """Files pipeline that stores downloads as '<parent dir>/<file name>'."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file fetched by *request*.

        The path keeps the last directory component and the file name of the
        request URL, e.g. '.../animation/animate_decay.py' becomes
        'animation/animate_decay.py' (joined with the OS path separator).

        ``item`` is accepted as a keyword-only argument with a default because
        newer Scrapy versions pass it to ``file_path``; it is not used here.
        """
        url_path = urlparse(request.url).path
        parent_dir = basename(dirname(url_path))
        return join(parent_dir, basename(url_path))

最后，改settings.py，前四行是自动生成的，就改ITEM_PIPELINES字典即可
# Project name and the package where spiders live / are generated.
BOT_NAME = 'matplotlib_examples'
SPIDER_MODULES = ['matplotlib_examples.spiders']
NEWSPIDER_MODULE = 'matplotlib_examples.spiders'

ROBOTSTXT_OBEY = True

# Route items through the custom files pipeline from pipelines.py.
ITEM_PIPELINES = {
    'matplotlib_examples.pipelines.MyFilesPipeline': 1,
}
# Storage location setting for the files pipeline.
FILES_STORE = 'examples_src'

#对了，本项目是我认真对照过书本，但是爬取失败了不知为何，但代码理解了就好

关于scrapy的使用思路总结：
·在第一章中，只改动examples.py这个主要实现爬虫的文件就可以爬了，middlewares.py默认就是不动的
·后面改items是为了明确要爬的信息还有让pipelines.py的代码可以操作这个数据实现自定义的下载方法
·而如果要用pipelines.py中自己写的函数，settings.py中就要改动ITEM_PIPELINES字典进行显式声明

关于未总结的三个章节内容：
·关于js动态网页的爬取，安装js渲染引擎Splash时那个pywin32版本有冲突，我怕我的Kivy跑不了所以没有装，用到再研究，好像疯狂讲义里有讲到模拟浏览器运行来爬取！
·关于代理服务器的爬取，我没有代理服务器，暂时跳过
·关于redis分布式爬取，我没有装redis，也没有多台电脑，暂时跳过