A little crawler project I put together in my spare time.
The main spider:
import scrapy
from ..items import MeiziItem


class MztSpider(scrapy.Spider):
    name = 'mzt'
    allowed_domains = ['meizitu.com']
    start_urls = ['http://meizitu.com/']

    def parse(self, response):
        tags = response.xpath(".//*[@class='tags']/span/a")
        for i in tags:
            item = MeiziItem()
            tag_href = i.xpath(".//@href").extract()[0]
            tag_name = i.xpath(".//@title").extract()[0]
            item['tag_name'] = tag_name
            item['tag_href'] = tag_href
            yield scrapy.Request(url=item['tag_href'], meta={'item': item},
                                 callback=self.parse_page)

    def parse_page(self, response):
        item = response.meta['item']
        # After entering a tag, grab the pagination buttons at the bottom of the page
        page_lists = response.xpath(".//*[@id='wp_page_numbers']/ul/li")
        # Read the text on those buttons to work out how many pages this tag has
        page_list = page_lists.xpath('.//text()')
        # If there are several pages, check whether the first button is '首页' (home):
        # on some pages the first button is '首页', on others it is '1'
        if len(page_lists) > 0:
            if page_list[0].extract() == '首页':
                page_num = len(page_lists) - 3
            else:
                page_num = len(page_lists) - 2
        else:
            page_num = 1

        # Build the page URLs from the tag's own URL
        if '_' in item['tag_href']:
            index = item['tag_href'][::-1].index('_')
            href_pre = item['tag_href'][:-index]
        else:
            if page_num == 1:
                href_pre = item['tag_href'].split('.html')[0]
            else:
                href_pre = item['tag_href'].split('.html')[0] + '_'
        for i in range(1, page_num + 1):
            # copy() so every request carries its own item instead of all sharing one
            item = response.meta['item'].copy()
            if page_num == 1:
                href = href_pre + '.html'
            else:
                href = href_pre + str(i) + '.html'
            item['page_list'] = href
            yield scrapy.Request(url=item['page_list'], meta={'item': item},
                                 callback=self.parse_album)

    def parse_album(self, response):
        albums = response.xpath(".//*[@class='pic']")
        for album in albums:
            item = response.meta['item'].copy()
            album_href = album.xpath(".//a/@href").extract()[0]
            album_name = album.xpath(".//a/img/@alt").extract()[0]
            item['album_name'] = album_name
            item['album_href'] = album_href
            yield scrapy.Request(url=item['album_href'], meta={'item': item},
                                 callback=self.parse_img)

    def parse_img(self, response):
        img_list = response.xpath(".//*/p/img")
        for i, img in enumerate(img_list, start=1):
            item = response.meta['item'].copy()
            # extract_first('') avoids an IndexError when the alt attribute is missing
            img_title = img.xpath(".//@alt").extract_first('')
            if img_title == '':
                # Fall back to '<album name>_<position>' when there is no alt text
                img_title = item['album_name'] + '_' + str(i)
            img_urls = img.xpath(".//@src").extract()
            img_src = img_urls[0]
            item['img_title'] = img_title
            item['img_src'] = img_src
            item['img_urls'] = img_urls
            yield item
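To make the pagination logic easier to follow, here is a standalone sketch of the URL building extracted from parse_page. The tag URL below is made up purely for illustration; the real patterns come from the site itself:

def build_page_urls(tag_href, page_num):
    # Mirrors parse_page: derive a URL prefix, then append page numbers
    if '_' in tag_href:
        index = tag_href[::-1].index('_')
        href_pre = tag_href[:-index]
    else:
        href_pre = tag_href.split('.html')[0] + ('' if page_num == 1 else '_')
    if page_num == 1:
        return [href_pre + '.html']
    return [href_pre + str(i) + '.html' for i in range(1, page_num + 1)]

# Hypothetical tag URL with three pages:
print(build_page_urls('http://meizitu.com/a/cute.html', 3))
# -> ['http://meizitu.com/a/cute_1.html', 'http://meizitu.com/a/cute_2.html',
#     'http://meizitu.com/a/cute_3.html']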
The item definitions (items.py):
import scrapy


class MeiziItem(scrapy.Item):
    # Tag name
    tag_name = scrapy.Field()
    # Tag URL
    tag_href = scrapy.Field()
    # Per-page URLs under a tag (with page numbers appended)
    page_list = scrapy.Field()
    # Album name
    album_name = scrapy.Field()
    # Album URL
    album_href = scrapy.Field()
    # Image title
    img_title = scrapy.Field()
    # Image URL
    img_src = scrapy.Field()
    # List of image URLs, used by ImagesPipeline to download the images
    img_urls = scrapy.Field()
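For readers new to Scrapy: an Item behaves like a dict restricted to its declared fields. A minimal sketch, with made-up values:

from meizi.items import MeiziItem  # assuming the project is named 'meizi', as in the settings below

item = MeiziItem()
item['tag_name'] = 'cute'                            # hypothetical value
item['tag_href'] = 'http://meizitu.com/a/cute.html'  # hypothetical URL
print(item['tag_name'])   # dict-style access
# item['foo'] = 1         # would raise KeyError: only declared Fields are allowed
fresh = item.copy()       # shallow copy; used in the spider so requests don't share one item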
The output pipeline:
print('Crawling...')
print('Hang tight, this is going to take a while...')


class MeiziPipeline(object):
    def process_item(self, item, spider):
        print('Tag name:', item['tag_name'])
        print('Tag URL:', item['tag_href'])
        print('Page URL:', item['page_list'])
        print('Album name:', item['album_name'])
        print('Album URL:', item['album_href'])
        print('Image title:', item['img_title'])
        print('Image URL:', item['img_src'])
        print('Image URL list:', item['img_urls'])
        print('----------------')
        return item
The pipeline for saving images locally:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MztImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['img_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no downloaded images')
        return item
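For reference, the results argument that item_completed receives is a list of (success flag, info) tuples, where the info dict of a successful download carries 'url', 'path' and 'checksum' keys. A small sketch with made-up values:

def collect_paths(results):
    # Keep only the storage paths of the downloads that succeeded
    return [info['path'] for ok, info in results if ok]

sample_results = [
    (True, {'url': 'http://example.com/1.jpg', 'path': 'full/abc123.jpg', 'checksum': 'd41d8cd9'}),
    (False, None),  # a failed download; Scrapy actually passes a Failure object here
]
assert collect_paths(sample_results) == ['full/abc123.jpg']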
Settings: add the following to settings.py.
BOT_NAME = 'meizi'

SPIDER_MODULES = ['meizi.spiders']
NEWSPIDER_MODULE = 'meizi.spiders'

IMAGES_STORE = r'G:\mzt'    # image storage path (with a raw string, a single backslash is enough)
IMAGES_EXPIRES = 90         # skip re-downloading images newer than this many days
IMAGES_MIN_HEIGHT = 100     # minimum image height
IMAGES_MIN_WIDTH = 100      # minimum image width
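One thing the snippet above leaves out: the two pipelines only run if they are registered in ITEM_PIPELINES. A sketch, assuming both classes live in meizi/pipelines.py (adjust the module path if yours differs):

ITEM_PIPELINES = {
    'meizi.pipelines.MeiziPipeline': 300,
    'meizi.pipelines.MztImagesPipeline': 301,
}

With everything wired up, the spider runs from the project root with: scrapy crawl mzt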
The final crawl results:
I had wanted to save the images sorted into folders by category, but I couldn't work out how, so everything ended up dumped into a single folder...
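Sorting the downloads into per-album folders turns out to be a matter of overriding file_path() on the images pipeline. A sketch along the lines of the MztImagesPipeline above; the class name SortedImagesPipeline and the album_name meta key are mine, and album names would still need sanitizing for characters Windows forbids in folder names:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class SortedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['img_urls']:
            # Carry the album name along with each download request
            yield scrapy.Request(image_url, meta={'album_name': item['album_name']})

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # Store each image as <IMAGES_STORE>/<album name>/<original file name>
        album = request.meta['album_name']
        filename = request.url.split('/')[-1]
        return '%s/%s' % (album, filename)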