First grab the top-level categories (Pre-K, K, 1st, and so on), then the sub-categories (Art, etc.).
Inside each sub-category, collect the links to the individual books.
Opening a book's link reveals its download link, but you can only see it while logged in, which is why every request carries cookies. You also need a registered account; sign-up links are easy to find online, and a new account comes with one month of free downloads.
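The spider below imports a COOKIES dict from twinkl/settings.py, which this post doesn't show. Here is a minimal sketch, assuming you copy the raw Cookie header from your browser's dev tools after logging in (the cookie names are placeholders, not Twinkl's real ones):

# twinkl/settings.py (sketch -- cookie names/values are placeholders)
# Paste the raw Cookie header copied from the browser after logging in,
# then split it into the dict that scrapy.Request(cookies=...) expects.
RAW_COOKIE = 'PHPSESSID=<your session id>; remember_token=<your token>'
COOKIES = dict(pair.split('=', 1) for pair in RAW_COOKIE.split('; '))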
The last step is pagination: once every book on the current page has been collected, grab the next page's URL and run the loop again, repeating until there is no next-page URL. At that point all the book resources have been collected.
The spider code is below (after it I've added sketches of the item definition and a random-UA middleware). This site has a few small pitfalls, but a bit of trial and error gets past them. I used random user agents and IP proxies; the next post explains in detail how to set up random UAs and proxies in Scrapy.
import re
from copy import deepcopy

import scrapy

from twinkl.items import TwinklItem
from twinkl.settings import COOKIES


class TbookSpider(scrapy.Spider):
    name = 'tbook'
    allowed_domains = ['twinkl.com.sg']
    start_urls = ['https://www.twinkl.com.sg/resources/usa-resources?sign_in=1']

    def start_requests(self):
        # The download links are only visible when logged in, so every
        # request carries the session cookies from settings.py.
        yield scrapy.Request(
            url=self.start_urls[0],
            cookies=COOKIES,
            callback=self.parse,
        )

    def parse(self, response):
        lis = response.xpath('//ul[@class="menuTab"]/li')
        for li in lis:
            item = TwinklItem()
            # Top-level category (Pre-K, K, 1st, ...)
            item['b_sort'] = li.xpath('./a/text()').extract_first()
            # urljoin resolves relative hrefs and avoids double slashes
            item['b_sort_href'] = response.urljoin(li.xpath('./a/@href').extract_first())
            # Sub-categories (Art, ...)
            s_list = li.xpath('./ul/li/ul/li/a')
            for s in s_list:
                item['s_sort'] = s.xpath('./text()').extract_first()
                item['s_sort_href'] = response.urljoin(s.xpath('./@href').extract_first())
                yield scrapy.Request(
                    url=item['s_sort_href'],
                    callback=self.get_book,
                    cookies=COOKIES,
                    # deepcopy so concurrent callbacks don't share one mutable item
                    meta={'item': deepcopy(item)},
                )

    def get_book(self, response):
        item = response.meta.get('item')
        lis = response.xpath('//ul[@id="resources"]/li')
        for li in lis:
            # Book detail-page link and title
            item['book_href'] = response.urljoin(li.xpath('./a/@href').extract_first())
            alt = li.xpath('./a/img/@alt').extract_first()
            # The alt text looks like "Title - ..."; keep only the title part
            item['book_name'] = alt.split('-')[0] if alt else ''
            yield scrapy.Request(
                url=item['book_href'],
                callback=self.download_book,
                cookies=COOKIES,
                meta={'item': deepcopy(item)},
            )
        # URL of the next page, if any (raw string so the regex escapes are literal)
        next_page = re.findall(
            r'<span>\.\.\.</span><a href="(.*?)" class="button last" data-page=".*?">Next</a>',
            response.text,
        )
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page[0]),
                callback=self.get_book,
                cookies=COOKIES,
                meta={'item': deepcopy(item)},
            )

    def download_book(self, response):
        item = response.meta.get('item')
        # The actual download link, only present when logged in
        href = response.xpath('//ul[@id="actual_downloads"]/li/span/a/@href').extract_first()
        if href is not None:
            # FilesPipeline expects file_urls to be a list of URLs
            item['file_urls'] = [response.urljoin(href)]
            item['name'] = item['b_sort'] + '&' + item['s_sort'] + '&' + item['book_name']
            yield item
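The spider also imports TwinklItem from twinkl/items.py, which isn't shown in this post either. A minimal sketch that declares exactly the fields the spider assigns (plus files, which Scrapy's FilesPipeline fills in after downloading):

# twinkl/items.py (sketch -- just the fields the spider uses)
import scrapy

class TwinklItem(scrapy.Item):
    b_sort = scrapy.Field()       # top-level category name
    b_sort_href = scrapy.Field()  # top-level category URL
    s_sort = scrapy.Field()       # sub-category name
    s_sort_href = scrapy.Field()  # sub-category URL
    book_href = scrapy.Field()    # book detail-page URL
    book_name = scrapy.Field()    # book title
    file_urls = scrapy.Field()    # download URLs consumed by FilesPipeline
    files = scrapy.Field()        # download results written by FilesPipeline
    name = scrapy.Field()         # 'big&small&title' label for naming the saved file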
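As for the random user agent mentioned above, the next post covers the full setup; as a preview, a minimal downloader-middleware sketch might look like this (the UA strings are just examples, and the middleware name is my own):

# twinkl/middlewares.py (sketch)
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a fresh UA for every outgoing request; a proxy could be
        # attached the same way via request.meta['proxy'].
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

Enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'twinkl.middlewares.RandomUserAgentMiddleware': 543,
}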