1.导入包
2.提取数据页面
3.item.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import re
from datetime import datetime

import scrapy

# Scrapy 1.0 relocated ``scrapy.contrib.loader`` to ``scrapy.loader`` and
# ``scrapy.contrib.loader.processor`` to ``scrapy.loader.processors``.
# Try the newer location first and fall back to the legacy one so the module
# imports on both old and newer installations.
# NOTE(review): recent Scrapy releases expose the processors from the
# separate ``itemloaders.processors`` package - confirm which Scrapy version
# this project targets.
try:
    from scrapy.loader import ItemLoader
except ImportError:
    from scrapy.contrib.loader import ItemLoader

try:
    from scrapy.loader.processors import Join, MapCompose, TakeFirst
except ImportError:
    from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst

# Compiled once at import time; shared by every counter-style converter.
_DIGITS = re.compile(r'\d+')


def _first_int(value):
    """Return the first run of digits in *value* as an int, or 0 if none."""
    match = _DIGITS.search(value)
    return int(match.group()) if match else 0


def convert_time(value):
    """Parse a ``YYYY/MM/DD`` date string into a ``datetime``.

    Used through ``MapCompose``, so *value* is a single extracted string,
    not the whole list of extracted values. Every ``'.'`` is removed and
    surrounding whitespace is stripped before parsing.

    Returns:
        The parsed ``datetime``; when the text does not match
        ``%Y/%m/%d`` the current local time is returned instead.
    """
    # NOTE(review): only the ASCII '.' is removed here; if the scraped text
    # uses another separator character the parse below fails and the
    # fallback is used - confirm against the page markup.
    value = value.replace('.', '').strip()
    try:
        return datetime.strptime(value, '%Y/%m/%d')
    except ValueError:
        # Deliberate best-effort: an unparsable date falls back to "now"
        # rather than aborting the item.
        return datetime.now()


def convert_sort(value):
    """Blank out entries containing ``'评论 '``; pass others through.

    Returns:
        ``""`` when *value* contains ``'评论 '``, otherwise *value* unchanged.
    """
    if '评论 ' in value:
        return ""
    return value


def convert_praise(value):
    """Extract the praise count from one extracted text fragment.

    Returns:
        ``None`` for an empty or whitespace-only fragment; otherwise the
        first integer found in *value*, or 0 when it contains no digits.
    """
    if value.strip() == "":
        # Explicit None (the original fell off the end of the function).
        # Per the Scrapy docs, MapCompose discards None results, so a blank
        # fragment does not turn into a leading 0 that TakeFirst would pick
        # ahead of the real count.
        return None
    return _first_int(value)


def convert_collect(value):
    """Return the first integer found in *value*, or 0 when there is none."""
    return _first_int(value)


def convert_comment(value):
    """Return the first integer found in *value*, or 0 when there is none."""
    return _first_int(value)


class BolespiderItem(scrapy.Item):
    """Item definition with per-field input/output processors.

    Input processors are applied to each extracted value through
    ``MapCompose``; output processors reduce the collected values with
    ``TakeFirst`` or ``Join``.
    """

    title = scrapy.Field(
        output_processor=TakeFirst(),
    )
    time = scrapy.Field(
        input_processor=MapCompose(convert_time),
        output_processor=TakeFirst(),
    )
    sort = scrapy.Field(
        input_processor=MapCompose(convert_sort),
        output_processor=Join(),
    )
    content = scrapy.Field(
        output_processor=Join(),
    )
    praise = scrapy.Field(
        input_processor=MapCompose(convert_praise),
        output_processor=TakeFirst(),
    )
    collect = scrapy.Field(
        input_processor=MapCompose(convert_collect),
        output_processor=TakeFirst(),
    )
    comment = scrapy.Field(
        input_processor=MapCompose(convert_comment),
        output_processor=TakeFirst(),
    )
    detail_url = scrapy.Field(
        output_processor=Join(),
    )