# -*- coding: utf-8 -*-
'''
@author Mrpora
'''
import json
import os

from ipindex_spider_py.utils.MongoHandler import MongoHandler
from ipindex_spider_py.items.Ip.baseCaAuthorItem import BaseCaAuthorItem
from ipindex_spider_py.items.Ip.baseCaCommentItem import BaseCaCommentItem
from ipindex_spider_py.items.Ip.baseCaDataItem import BaseCaDataItem
from ipindex_spider_py.items.Ip.baseCaTopItem import BaseCaTopItem
from ipindex_spider_py.utils import dateUtil

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FilePipeline(object):
    """Writes each item type to its own month-stamped JSON-lines file."""

    def __init__(self):
        # table: fileName (one output file per item type, stamped with YYYYMM)
        fileNames = {
            'caAuthor': 'data/ip_top2/caAuthor' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caComment': 'data/ip_top2/caComment' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caData': 'data/ip_top2/caData' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caTop': 'data/ip_top2/caTop' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
        }
        # Make sure the output directory exists before opening the files.
        for fileName in fileNames:
            parent_path = os.path.abspath(os.path.dirname(fileNames[fileName]))
            if not os.path.exists(parent_path):
                os.makedirs(parent_path)
        # Open in text mode with an explicit encoding instead of the old
        # reload(sys)/sys.setdefaultencoding('utf8') hack, which only worked
        # on Python 2.
        self.file_caAuthor = open(fileNames['caAuthor'], 'w', encoding='utf-8')
        self.file_caComment = open(fileNames['caComment'], 'w', encoding='utf-8')
        self.file_caData = open(fileNames['caData'], 'w', encoding='utf-8')
        self.file_caTop = open(fileNames['caTop'], 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize the item as one JSON object per line and route it to the
        # file that matches its type.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        if isinstance(item, BaseCaAuthorItem):
            self.file_caAuthor.write(line)
        elif isinstance(item, BaseCaCommentItem):
            self.file_caComment.write(line)
        elif isinstance(item, BaseCaDataItem):
            self.file_caData.write(line)
        elif isinstance(item, BaseCaTopItem):
            self.file_caTop.write(line)
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.file_caAuthor.close()
        self.file_caComment.close()
        self.file_caData.close()
        self.file_caTop.close()


class MongoDBPipeline(object):
    """Saves each item type to its own MongoDB collection."""

    def __init__(self, host, port, user, password, db):
        self.mongo_host = host
        self.mongo_port = port
        self.mongo_user = user
        self.mongo_password = password
        self.mongo_db = db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection details once instead of re-fetching the
        # settings dict for every field.
        conf = crawler.settings.get('MONGODB_CONFIGURATION')
        return cls(
            host=conf['mongo_host'],
            port=conf['mongo_port'],
            user=conf['mongo_user'],
            password=conf['mongo_password'],
            db=conf['mongo_db']
        )

    def open_spider(self, spider):
        self.mongoHandler = MongoHandler(host=self.mongo_host, port=self.mongo_port,
                                         db=self.mongo_db, user=self.mongo_user,
                                         password=self.mongo_password)

    def close_spider(self, spider):
        self.mongoHandler.close()

    def process_item(self, item, spider):
        if isinstance(item, BaseCaTopItem):
            self.mongoHandler.save_item('tmp_comicTop', dict(item))
        elif isinstance(item, BaseCaAuthorItem):
            self.mongoHandler.save_item('tmp_comicAuthor', dict(item))
        elif isinstance(item, BaseCaDataItem):
            self.mongoHandler.save_item('tmp_comicData', dict(item))
        elif isinstance(item, BaseCaCommentItem):
            self.mongoHandler.save_item('tmp_comicComment', dict(item))
        return item
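

# The pipelines above rely on ipindex_spider_py.utils.MongoHandler, whose
# source is not part of this file. Below is a minimal sketch of a compatible
# handler built on pymongo, matching only the constructor and the
# save_item()/close() call sites used above; the class name and internals are
# assumptions, not the project's actual implementation.

import pymongo


class MongoHandlerSketch(object):
    def __init__(self, host, port, db, user=None, password=None):
        # Credentials are optional; pymongo skips authentication when
        # username/password are left as None.
        self.client = pymongo.MongoClient(host=host, port=int(port),
                                          username=user, password=password)
        self.db = self.client[db]

    def save_item(self, table, item_dict):
        # One MongoDB document per scraped item, one collection per table.
        self.db[table].insert_one(item_dict)

    def close(self):
        self.client.close()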
# -*- coding: utf-8 -*-
import os

if __name__ == '__main__':
    # os.system("scrapy crawl acqqTopSpider")
    # os.system("scrapy crawl youyaoqiTopSpider")
    # os.system("scrapy crawl youyaoqiDataSpider")
    # os.system("scrapy crawl youyaoqiAuthorSpider")
    os.system("scrapy crawl youyaoqiCommentSpider")
    # os.system("scrapy crawl dajiaochongTopSpider")
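
# The launcher above shells out to the scrapy CLI. An equivalent in-process
# alternative uses Scrapy's public CrawlerProcess API; a minimal sketch,
# assuming it is run from the project root so get_project_settings() can
# locate scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('youyaoqiCommentSpider')
    process.start()  # blocks here until the crawl is finished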
# -*- coding: utf-8 -*-
import datetime
import re

import scrapy

from ipindex_spider_py.items.Ip.youyaoqi.youyaoqiCommentItem import YouyaoqiCommentItem
from ipindex_spider_py.utils import dateUtil, commonUtil


class YouyaoqiCommentSpider(scrapy.Spider):
    name = "youyaoqiCommentSpider"

    # Rank pages for both groups, first two pages each.
    start_urls = [
        'http://comic.u17.com/rank/t2.html?group=' + str(i) + '&page=' + str(j)
        for i in (1, 2) for j in (1, 2)
    ]

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'Referer': 'http://comic.u17.com/'
        },
        'ITEM_PIPELINES': {'ipindex_spider_py.pipelines.FilePipeline': 300},
        'FILE_NAME': 'data/ip_top2/youyaoqiTop' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json'
    }

    def parse(self, response):
        # One <li> per comic on the rank page; only the top 30 are kept.
        papers = response.xpath("//div[@class='rank_more_box']/ul/li")
        for paper in papers:
            rank = paper.xpath(".//i/text()").extract()[0]
            if int(rank) > 30:
                continue
            item = YouyaoqiCommentItem()
            item['platform'] = "youyaoqi"
            item['spiderTime'] = str(datetime.datetime.now())
            link = paper.xpath(".//div[1]/div[@class='categray']/a/@href").extract()[0]
            item['oid'] = commonUtil.MD5Util(link)
            authorlink = paper.xpath(".//div[2]/a/@href").extract()[0]
            item['aid'] = commonUtil.MD5Util(authorlink)
            item['dataVersion'] = 'V' + str(datetime.datetime.now().strftime("%Y%m"))
            yield scrapy.Request(link, callback=self.parse_authorInfo, meta={'item': item})

    def parse_authorInfo(self, response):
        # Pull the comic and comment-thread ids out of the detail page, then
        # request the first page of the comment AJAX endpoint.
        item = response.meta['item']
        comic = re.findall(r"comic_id=(\d*)", response.text, re.I)
        thread = re.findall(r"thread_id:(\d*)", response.text, re.I)
        url = ("http://www.u17.com/comment/ajax.php?mod=thread&act=get_comment_php_v4"
               "&_=1525857397287&sort=create_time&thread_id=" + ''.join(thread) +
               "&object_type=custom&object_id=0&page=1&page_size=20&face=small"
               "&comic_id=" + ''.join(comic))
        yield scrapy.Request(url, callback=self.parse_nextauthorInfo, meta={'item': item})

    def parse_nextauthorInfo(self, response):
        papers = response.xpath("//body/div")
        flag = True
        for paper in papers:
            # Copy the shared base item so each yielded comment carries its
            # own datetime/content instead of mutating one shared object.
            item = response.meta['item'].copy()
            d1 = datetime.datetime.now() - datetime.timedelta(hours=48)
            dt = paper.xpath(".//dt/i/@title").extract()[0]
            dd = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            if d1 <= dd:
                # Comment is newer than 48 hours: keep it.
                flag = True
                item['datetime'] = dt
                item['content'] = paper.xpath(".//div[@class='ncc_content_right_text']/text()").extract()[0]
                yield item
            else:
                flag = False
        if flag:
            # The last comment on this page was still recent, so fetch the
            # next page (until total_page is reached).
            i = int(''.join(re.findall(r"page=(\d*)", response.url, re.I))) + 1
            url = re.sub(r"page=(\d*)", "page=" + str(i), response.url)
            last = int(''.join(re.findall(r'"total_page":(\d*)', response.text, re.I)))
            if i <= last:
                yield scrapy.Request(url, callback=self.parse_nextauthorInfo,
                                     meta={'item': response.meta['item']})
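

# parse_nextauthorInfo above advances through the comment AJAX endpoint by
# rewriting the page=N query parameter with a regex. A standalone sketch of
# that rewrite, using an illustrative URL (the thread/comic ids below are
# placeholders, not real values):

import re


def next_page_url(url):
    # Read the current page number, increment it, and substitute it back.
    page = int(''.join(re.findall(r"page=(\d+)", url))) + 1
    return re.sub(r"page=\d+", "page=" + str(page), url)


if __name__ == '__main__':
    sample = ("http://www.u17.com/comment/ajax.php?mod=thread"
              "&act=get_comment_php_v4&sort=create_time&thread_id=111"
              "&object_type=custom&object_id=0&page=1&page_size=20"
              "&face=small&comic_id=222")
    # page=1 becomes page=2; page_size is untouched because the pattern
    # requires a literal "page=" immediately followed by digits.
    print(next_page_url(sample))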