Fetch the Jin10 (金十数据) page, keep the data up to date, and capture its content incrementally.
Environment: PyCharm, MongoDB, Windows 7, Python 3.7
Link (full project source): https://pan.baidu.com/s/1MpZM8XJoCxBlmsSzg2WY3w
Extraction code: muqe
Incremental fetching and updating: an item that is not yet in the stored data gets saved; an item that already exists is dropped outright and never written to the database again.
import pymongo
from scrapy.exceptions import DropItem


class MongoPipeline(object):
    def open_spider(self, spider):
        # Read the connection details from the project settings.
        settings = spider.settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        self.client = pymongo.MongoClient(host=host, port=port)
        db = self.client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        # Incremental save: date_id is the deduplication key, so only
        # unseen items are inserted; duplicates are dropped.
        if self.post.find_one({'date_id': item['date_id']}):
            raise DropItem('duplicate item: %s' % item['date_id'])
        self.post.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()
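The pipeline pulls its connection details from settings.py. A minimal sketch of the keys it expects; the host, port, and names below are placeholder assumptions, as is the module path in ITEM_PIPELINES:

# settings.py -- keys read by MongoPipeline (all values here are placeholders)
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'jin10'          # assumed database name
MONGODB_DOCNAME = 'flash_news'    # assumed collection name
ITEM_PIPELINES = {
    'jinshi_project.pipelines.MongoPipeline': 300,  # module path is an assumption
}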
Parsing the page data
import re

import scrapy
from scrapy import signals
from selenium import webdriver


class JinshiSpider(scrapy.Spider):
    name = 'jinshi'
    allowed_domains = ['jin10.com']
    start_urls = ['https://www.jin10.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(JinshiSpider, cls).from_crawler(crawler, *args, **kwargs)
        # One headless Chrome per spider; it renders the JS-driven flash list
        # (the downloader middleware that uses it is not shown in this post).
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        spider.chrome = webdriver.Chrome(options=options)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        spider.chrome.quit()
        print('Crawl finished -- waiting for the next round')

    def parse(self, response):
        flash_list = response.xpath('//div[@class="jin-flash"]/div[@id="J_flashList"]')
        dates = flash_list.xpath('.//div/@data-id').extract()
        infos = flash_list.xpath('.//div[@class="jin-flash_b"]').extract()
        # The numeric id on each flash <div> doubles as the deduplication
        # key (date_id) checked by the MongoDB pipeline.
        date_ids = re.findall(r'<div id="(\d+)" data-id=".*?" class=".*?">', response.text, re.DOTALL)
        for date_id, date, info in zip(date_ids, dates, infos):
            base_url = 'https:'
            # Strip the tags, then collapse the whitespace the markup leaves behind.
            wenzi = re.sub(r'<.*?>', '', info).strip().strip('\u200b')
            wenzi_text = re.sub(r'\s+', ' ', wenzi)
            # "Read more" (阅读更多) link of the flash item, if any.
            gengduo = re.compile(r'<a href="(.*?)" target="_blank" class="jin-flash_text-more1">阅读更多<i class="jin-icon jin-icon_rightArrow"></i></a>')
            gengduo_url = ''.join(gengduo.findall(info))
            # Protocol-relative link of an embedded article title.
            wenzi_href = re.compile(r'<h4 class="jin-flash_data-title"><a href="(.*?)" target="_blank">.*?</a></h4>')
            wenzi_href_url = base_url + ''.join(wenzi_href.findall(info))
            # Lazy-loaded image URL.
            photo = re.compile(r'<img class="J_lazyImg" data-original="(.*?)" src=".*?">')
            photo_url = base_url + ''.join(photo.findall(info))
            yield {'date_id': date_id, 'date': date, 'wenzi_text': wenzi_text,
                   'gengduo_url': gengduo_url, 'wenzi_href_url': wenzi_href_url,
                   'photo_url': photo_url}
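Nothing in the spider shown above actually drives the headless Chrome; presumably the full project (in the network-disk link above) renders pages through a downloader middleware. A minimal sketch of what that glue could look like -- the class name, wait time, and registration path are all assumptions:

# middlewares.py -- hypothetical SeleniumMiddleware: routes each request
# through the spider's headless Chrome so parse() sees the rendered DOM.
import time
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        spider.chrome.get(request.url)
        time.sleep(3)  # crude wait for the JS flash list to finish rendering (assumed)
        # Returning an HtmlResponse here short-circuits Scrapy's own downloader.
        return HtmlResponse(url=request.url, body=spider.chrome.page_source,
                            encoding='utf-8', request=request)

It would then be enabled via DOWNLOADER_MIDDLEWARES in settings.py (module path again an assumption).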
Launch the crawler every 30 seconds: revisit the page once per cycle and pick up whatever has changed.
import os
import time

if __name__ == '__main__':
    while True:
        # Spawn a fresh process per crawl: Scrapy's Twisted reactor
        # cannot be restarted inside a single Python process.
        os.system('scrapy crawl jinshi')
        # Run once every 30 seconds.
        time.sleep(30)
Finally, a reader's question: how do you crawl a site that loads its data over socket.io? Instead of scraping the rendered page, you can connect to the underlying push interface directly.
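A minimal sketch with the python-socketio client (pip install "python-socketio[client]"); the endpoint URL and event name below are assumptions -- find the real ones in the WS tab of the browser's devtools:

import socketio

sio = socketio.Client()

@sio.on('flash')  # hypothetical event name -- check devtools for the real one
def on_flash(data):
    print('new push:', data)

sio.connect('https://example.com')  # hypothetical socket.io endpoint
sio.wait()  # block and keep receiving pushed messages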