# Main code
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Running total of items produced so far; BeautySpider.parse_item declares it
# global and increments it once per parsed page.
count = 0
class BeautySpider(CrawlSpider):
    """Crawl the start page and scrape every post page it links to.

    Links on the start page whose URL matches ``http://www.meinv.hk/?p=<digits>``
    are fetched and passed to :meth:`parse_item`, which extracts the post
    title and the image URLs found in the post body.
    """

    name = 'beauty'

    # Must be a URL Scrapy can actually download. The earlier value,
    # 'http123://www.meinv1213.hk/', used a scheme with no download handler
    # and a host that differs from the one in the link rule below.
    start_urls = ['http://www.meinv.hk/']

    rules = (
        Rule(
            # Raw string so that ``\?`` and ``\d`` reach the regex engine
            # unchanged; dots are escaped so they match a literal '.' only.
            LinkExtractor(allow=r'http://www\.meinv\.hk/\?p=\d+'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Build one item from a post page.

        Args:
            response: The downloaded post page.

        Returns:
            A dict with two keys: ``star_name`` (text of the first
            ``h1.title`` element, or ``None`` when there is none) and
            ``image_urls`` (list of ``src`` attributes of every ``<img>``
            inside ``div.post-content``; empty when there are none).
        """
        # The counter lives at module level so it is shared by all callbacks.
        global count

        item = {
            'star_name': response.xpath('//h1[@class="title"]/text()').get(),
            'image_urls': response.xpath(
                '//div[@class="post-content"]//img/@src'
            ).getall(),
        }
        count += 1

        # Use the spider's logger instead of print() so the output follows
        # Scrapy's configured log level and handlers.
        self.logger.info(
            'item #%d: star_name=%r image_urls=%r',
            count,
            item['star_name'],
            item['image_urls'],
        )
        return item
"""
Rule for extracting the beauty pictures linked from the home page.
"""
"""
2020-04-27 17:18:04 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.meinv.hk/?p=2701>
{'star_name': '嘘', 'image_urls': ['http://www.meinv.hk/wp-content/uploads/2018/02/2018020721314999.jpeg',
 'http://www.meinv.hk/wp-content/uploads/2018/02/2018020721314456.jpg']}
"""