import sys
reload(sys)
sys.setdefaultencoding('utf8')
import copy
import json
import time,datetime
import re
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
# Blank record for one scraped article. detail_page deep-copies this dict and
# fills in the fields it can extract, so every result carries the same keys.
result_template = {
    "info_id": "",        # news item ID (auto-increment)
    "url": "",            # URL of the original article
    "title": "",          # title
    "subheading": "",     # subtitle
    "fetch_time": "",
    "pub_time": "",       # publication time stated in the article itself, not the time the crawler fetched it
    "sort": "",           # category interface (?)
    "summary": "",        # summary of the news item
    "content": "",        # body text
    "persons": "",        # people mentioned
    "companys": "",       # companies mentioned
    "stocknames": "",     # stocks mentioned
    "stockcodes": "",     # stock codes mentioned
    "industries": "",     # industries involved
    "sections": "",       # market sectors involved
    "others": "",
    "info_type": "",      # article type: announcement / news
    "source": "",         # publishing organization
    "info_channel": "",   # second-level heading/channel and everything below it, joined with "_"; the "home" and "body" levels are excluded
    "editor": "",         # editor
    "keywords": "",       # keywords supplied by the article itself
    "datetime": "",       # time the article was collected
    "imageAttachment": "null",  # image attachments (the literal string "null", not None)
    "fileAttachment": "null",   # file attachments (the literal string "null", not None)
    "html": "",
}
# Name of the crawled site; detail_page copies it into every result.
source_name = "中国金融网"

# Listing pages to crawl. Each "url" contains a %s placeholder that on_start
# fills with the page number; the whole entry is passed along through `save`.
# NOTE(review): only the second entry carries its own "source_name", and the
# visible code reads the module-level value instead — confirm whether the
# per-entry key is still needed.
source_list = [
    {
        "url": "http://m.duaixs.com/articles/?template=sample_397.html&page=%s",
        "source_channel": "新闻",
    },
    {
        "url": "http://www.duaixs.com/articleList/1.shtml?page=%s",
        "source_channel": "焦点",
        "source_name": "中国金融网",
    },
]
# Request headers with a desktop Chrome User-Agent.
# Pass to a request with ``headers=headers`` in self.crawl(...).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.67 Safari/537.36'
    ),
}
class Handler(BaseHandler):
crawl_config = {
}
@every(minutes=2 * 60)
def on_start(self):
    """Queue the first listing page of every entry in ``source_list``.

    Scheduled through ``@every(minutes=2 * 60)``, i.e. a two-hour interval.
    The whole source entry is attached to the request via ``save`` so that
    later callbacks can read its fields (e.g. ``source_channel``) from
    ``response.save``.
    """
    for source in source_list:
        # range(1, 2) yields only page 1; raise the upper bound to crawl
        # further listing pages.
        for page in range(1, 2):
            self.crawl(
                source['url'] % page,  # fills the %s page placeholder
                headers=headers,
                callback=self.index_page,
                save=source,
            )
@config(age=1)
def index_page(self, response):
    """Follow every article link found on a listing page.

    Each anchor matched by ``dl.dl_artListB dt a`` that has an ``href`` is
    queued for ``detail_page``; the ``save`` payload received with this
    response is forwarded unchanged.
    """
    anchors = response.doc('dl.dl_artListB dt a').items()
    for anchor in anchors:
        link = anchor.attr.href
        if not link:
            # Anchor without a usable href: nothing to follow.
            continue
        self.crawl(
            link,
            headers=headers,
            callback=self.detail_page,
            save=response.save,
        )
@config(priority=2,age=10 * 24 * 60 * 60)
def detail_page(self, response):
result = copy.deepcopy(result_template)
result["url"] = response.url
result["source_channel"] = response.save['source_channel']
result["source_name"] = source_name
if response.doc('div.contDetailsBox').html():
result["html"]
杰奇小说Pyspider脚本编写指南
最新推荐文章于 2020-05-18 16:22:54 发布