存储使用 MySQL,增量更新东方头条全站新闻的标题、新闻简介、发布时间、新闻每一页的内容,以及新闻内的所有图片。项目文件结构。
以下是 run.py 的内容:
1 #coding=utf-8 2 from scrapy import cmdline 3 import redis,time,threading 4 from multiprocessing import Process 5 #import scrapy.log 6 7 #cmdline.execute("scrapy crawl baoxian -s LOG_FILE=scrapy10.log".split()) 8 9 #scrapy crawl myspider -s LOG_FILE=scrapy2.log 10 11 12 start_urls = ['http://mini.eastday.com/', 13 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0010¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603095010319,170603093955594-2,170603093955594&jsonpcallback=jQuery18303164258797187358_1496455837718&_=1496455838146', #国内 14 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0011¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603142336718-2,170603142336718,170603122752716&jsonpcallback=jQuery18307262756496202201_1496477922458&_=1496477923254', #国际 15 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0005¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18302500620267819613_1496483754044&_=1496483755277',#军事 16 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0003¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183026658024708740413_1496480575988&_=1496480576634',#社会 17 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0002¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery1830691694314358756_1496480816841&_=1496480817500',#娱乐 18 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0019¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18303703077440150082_1496480892188&_=1496480892581',#健康 19 
'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0015¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183023222095426172018_1496480961781&_=1496480962307',#时尚 20 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0008¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183017557532875798643_1496481013410&_=1496481013824',#科技 21 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0012¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18308183211348950863_1496481106550&_=1496481106993',#汽车 22 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0018¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18309359942991286516_1496481227742&_=1496481228242',#人文 23 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0007¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183019699203735217452_1496481313637&_=1496481314077',#游戏 24 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0020¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18307782149398699403_1496481413006&_=1496481413401',#星座 25 'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0021¶m=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18306590236281044781_1496481467020&_=1496481467496',#家居 26 27 ] 28 29 r = redis.Redis(host='127.0.0.1',port=6379,db=0) 30 31 32 33 def check_redis_requsts(): 34 while(1): 35 ''' 36 for url in start_urls: 37 r.rpush('eastdayspider:start_urls',url) 38 print u'插入到start_urls的:',r.lrange('eastdayspider:start_urls',0,-1) 39 ''' 40 for url in start_urls: 41 
r.sad