使用 Scrapy 爬取 Readhub 新闻。下面依次给出 items.py 中的 Item 定义和 spider 爬虫代码。
# Item 定义(位于项目的 items.py)
class AifranItem(scrapy.Item):
    """Container for one Readhub news entry produced by the spider.

    Fields mirror exactly what ``IfranSpider.parse`` assigns; no field
    carries validation or a default.
    """

    img = scrapy.Field()        # thumbnail URL (the spider currently sets '')
    title = scrapy.Field()      # title of the topic's first attached article
    url = scrapy.Field()        # topic page URL on readhub.cn
    date = scrapy.Field()       # scrape date, formatted YYYY-MM-DD
    create_at = scrapy.Field()  # scrape timestamp, YYYY-MM-DD HH:MM:SS
    source = scrapy.Field()     # fixed origin tag: 'readhub'
# Spider 定义(位于项目的 spiders/ 目录)
import json
from asyncio import sleep
import time
from telnetlib import EC
import scrapy
from scrapy import Selector, Request
from selenium import webdriver
from ..items import AifranItem, Ygmoive_info_Item, YgmoiveItem, Ygmoive_pages_Item
class IfranSpider(scrapy.Spider):
    """Spider that polls the Readhub topic API and yields ``AifranItem``s.

    The start URL returns JSON; each element of the response's ``data``
    list is a topic whose first entry in ``newsArray`` supplies the title.
    One item is yielded per topic.
    """

    name = 'readhub_spider'
    start_urls = [
        'https://api.readhub.cn/topic?lastCursor=&pageSize=20'
    ]
    # NOTE: the empty __init__ from the original draft was removed — it
    # overrode scrapy.Spider.__init__ without calling super(), silently
    # dropping the base class's name/**kwargs handling.

    def closed(self, spider):
        # Called by Scrapy when the spider finishes. Nothing to release:
        # the Selenium browser from the original draft is disabled.
        print('spider closed')

    def parse(self, response):
        """Parse one API response and yield an item per topic.

        NOTE(review): assumes every topic dict has a non-empty
        'newsArray' and an 'id' key — confirm against the API schema.
        """
        payload = json.loads(response.text)
        for topic in payload['data']:
            item = AifranItem()
            item['img'] = ''
            item['title'] = topic['newsArray'][0]['title']
            item['url'] = 'https://readhub.cn/topic/' + topic['id']
            # Both timestamps record scrape time, not publication time.
            item['date'] = time.strftime("%Y-%m-%d", time.localtime())
            item['create_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['source'] = 'readhub'
            yield item
运行方式:在 shell 中执行 `scrapy crawl readhub_spider` 启动爬虫,并通过定时任务(例如 Linux 的 crontab 条目 `*/30 * * * *`)设置每隔三十分钟爬取一次。