Basic script
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import copy
import json
import time,datetime
import re
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
result_template = {
"info_id":"", # news item ID (auto-increment)
"url":"", # original article URL
"title":"", # title
"subheading":"", # subtitle
"fetch_time":"",
"pub_time":"", # publish time taken from the article body, not the time the crawler fetched the article
"sort":"", # category interface ?
"summary":"", # article summary
"content":"", # body text
"persons":"", # people mentioned
"companys":"", # companies mentioned
"stocknames":"", # stocks mentioned
"stockcodes":"", # stock codes mentioned
"industries":"", # industries mentioned
"sections":"", # market sections mentioned
"others":"",
"info_type":"", # article type: announcement / news
"source":"", # publishing organization
"info_channel":"", # all channel/section titles from the 2nd level down, joined with underscores "_", excluding "首页" (home) and "正文" (body); see the sketch after this template
"editor":"", # editor
"keywords":"", # keywords provided by the article
"datetime":"", # crawl time
"imageAttachment":"null", # image attachments
"fileAttachment":"null", # file attachments
"html":"",
}
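A minimal sketch of how info_channel could be assembled from a page's breadcrumb trail. The helper name and the 'div.breadcrumb a' selector are assumptions for illustration, not part of the original script:

from pyquery import PyQuery as pq

def build_info_channel(doc):
    # Collect breadcrumb titles, drop "首页" (home) and "正文" (body), join the rest with "_".
    parts = [a.text().strip() for a in doc('div.breadcrumb a').items()]  # hypothetical selector
    parts = [p for p in parts if p and p not in (u'首页', u'正文')]
    return '_'.join(parts)

# Example: a trail of 首页 > 财经 > 股票 > 正文 becomes "财经_股票".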
source_name = "中国金融网"
source_list = [
{
"url": "http://www.cnfinance.cn/articles/?template=sample_397.html&page=%s",
"source_channel": "新闻",
},
{
"url": "http://www.financeun.com/articleList/1.shtml?page=%s",
"source_channel": "焦点", "source_name": "中国金融网"
}
]
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
class Handler(BaseHandler):
crawl_config = {
}
@every(minutes=2 * 60)
def on_start(self):
for source in source_list:
url = source['url']
source_channel = source['source_channel']
for i in range(1,2):
self.crawl(url % str(i),headers=headers, callback=self.index_page, save=source)
@config(age=1)
def index_page(self, response):
for each in response.doc('dl.dl_artListB dt a').items():
href = each.attr.href
if href:
self.crawl(href,headers=headers, callback=self.detail_page,save=response.save)
@config(priority=2,age=10 * 24 * 60 * 60)
def detail_page(self, response):
result = copy.deepcopy(result_template)
result["url"] = response.url
result["source_channel"] = response.save['source_channel']
result["source_name"] = source_name
if response.doc('div.contDetailsBox').html():
result["html"] = response.doc('div.contDetailsBox').html().strip()
result["editor"] = response.doc('p.p_author.span').text().replace('作者:','')
result["source"] = response.doc(' p.p_artInfo span ').eq(1).text().replace('摘自:','')
result["title"] = response.doc('h2.h2_artDetails').text()
result["pub_time"] = response.doc('p.p_artInfo span ').eq(0).text().replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
result["content"] = get_content_from_html(result["html"])
result["pub_time"] = str_2_timestamp(result["pub_time"])
result["pub_time"] = get_pub_time(result["pub_time"])
result["datetime"] = get_now_time()
self.send_message(self.project_name, result, url=result["url"])
def json_handler(self, response):
# Handler for detail pages served as JSON; fields saved by the previous request arrive in response.save.
result = copy.deepcopy(result_template)
data = json.loads(response.text)
result["title"] = response.save['title']
result["author"] = response.save['author']
html = "<h1>%s</h1>" % response.save['title']
html += data['data']['content']
result['html'] = html
result["content"] = get_content_from_html(html)
result["summary"] = data['data']['content_short']
result['pub_time'] = timestamp_to_str(response.save['display_time'])
self.send_message(self.project_name, result, url=result["url"])
def on_message(self, project, msg):
return msg
def get_content(response):
import chardet
from readability import Document
import html2text
char_encoding = chardet.detect(response.content) # bytes
#print(char_encoding)
if char_encoding["encoding"] == "utf-8" or char_encoding["encoding"] == "utf8":
doc = Document(response.content.decode("utf-8"))
else:
doc = Document(response.content.decode("gbk","ignore"))
title = doc.title()
content = doc.summary()
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(content).replace("-\n","-")
return d_data.rstrip()
def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
if not time_str:
return ""
elif len(time_str) == 9:
fmt = "%Y-%m-%d"
elif len(time_str) == 10:
fmt = "%Y-%m-%d"
elif len(time_str) == 13:
fmt = "%Y-%m-%d %H"
elif len(time_str) == 16:
fmt = "%Y-%m-%d %H:%M"
return int(time.mktime(time.strptime(time_str, fmt)))
def get_content_from_html(html):
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(html).replace("-\n","-")
return d_data.rstrip()
def get_pub_time(response):
#date_time = response.doc('div.content div.titleHead div.newsDate').text()
#date_time = response.doc("div#article.article span#pubtime_baidu").text()
#return date_time
#timeArray = time.strptime(response, "%Y-%m-%d %H:%M:%S")
# convert to a timestamp
#timestamp = time.mktime(timeArray)
# Multiplying by 10 and keeping the first 10 digits normalizes both second (10-digit)
# and millisecond (13-digit) timestamps to a 10-digit seconds string.
return str(response*10)[0:10]
def re_search_time(time_str):
r_str = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2})"
mat = re.search(r_str, time_str)
if not mat:
return ""
return mat.group(0)
def re_sub_html(html):
return re.sub(r'<!--.*?-->','',html)
def get_now_time():
return str(int(time.time()))
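json_handler above calls timestamp_to_str, which is not defined in this script. A minimal sketch, assuming display_time is a second-resolution Unix timestamp (adjust if the API returns milliseconds):

def timestamp_to_str(ts, fmt="%Y-%m-%d %H:%M:%S"):
    # Convert a Unix timestamp (seconds) into a formatted local-time string.
    return time.strftime(fmt, time.localtime(int(ts)))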
Basic processing
Tag concatenation
html = "<h1>%s</h1>" % result["title"]
html += "<div>%s</div>" % response.doc("div#News_Body_Txt_A").html()
result["title"] = response.doc('div.lasttle>p').remove() # 删除标签
result["pub_time"] = response.doc(' div.source ').addClass('beauty')
result["source"] = response.doc('div.article-info > span:contains("来源")').text()
result["html"] = response.doc('div.article_content').remove('div.article_content>div:last-child').html()
result['source'] = response.doc('div.attr span').eq(1).text().replace(u'来源:', '') # eq(index)
response.doc('ul#news_list >li:nth-child(-n+20) > div[class|="txt"] > h3 > a') # first 20 items
div.newsCon section:nth-child(2) p:nth-last-child(-n+3) # the last three p tags
response.doc("td.STYLE4").parent().parent()('tr').eq(1).text() # eq(index): 0 is the first row, 1 is the second row
response.doc('div.weicode').nextAll().remove()
response.doc("div.Middle4 div.Middle4_body div.txt1").nextAll().remove()
content_table = response.doc("div.portlet table.hui12").siblings('table')
For pages stuffed with tbody/table/tr/td nesting, it is easier to select by table tags only, e.g. body > table table table table:nth-child(2) table table td (see the sketch below).
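A self-contained sketch of that approach; the markup here is hypothetical and only illustrates skipping the intermediate layers:

from pyquery import PyQuery as pq

# Hypothetical nested-table page fragment, for illustration only.
doc = pq('<div><table><tr><td><table><tr><td>body text</td></tr></table></td></tr></table></div>')
# Descendant selectors ignore the tr/td noise and address the inner cell directly.
print(doc('table table td').text())  # -> body text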
.next() returns the sibling element immediately following each element in the matched set; if a selector is given, the next sibling is returned only when it matches that selector.
1. (Note: the next two points describe JDBC's ResultSet.next(), not pyquery.) .next() moves the cursor to the next record; if a record exists it returns true and the record becomes readable through the object it was called on (the obj in obj.next()), otherwise it returns false.
2. It is normally used with a ResultSet object and a while loop to iterate over the result set, calling getXXX(int fieldIndex) / getXXX(String columnName) inside the loop to read column values.
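A small pyquery illustration of .next() with and without a selector (the markup is hypothetical):

from pyquery import PyQuery as pq

doc = pq('<div><h2>title</h2><p class="meta">2018-11-07</p><p>body</p></div>')
print(doc('h2').next().text())          # immediate next sibling -> 2018-11-07
print(doc('h2').next('p.meta').text())  # next sibling only if it matches the selector
print(doc('h2').nextAll('p').length)    # all following p siblings -> 2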
page = source['page'] if 'page' in source else num — a per-source page can be set to 1 for channels that should not be paginated, while other channels fall back to a global num and crawl num pages, so there is no need for per-channel checks on whether to paginate (see the sketch below).
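A minimal sketch of that pattern (on_start belongs inside the Handler class; the global num and the optional per-source 'page' key are assumptions for illustration):

num = 5  # default number of pages to crawl per channel (assumed value)

def on_start(self):
    for source in source_list:
        pages = source['page'] if 'page' in source else num  # per-channel override, else the global default
        for i in range(1, pages + 1):
            self.crawl(source['url'] % str(i), headers=headers, callback=self.index_page, save=source)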
Tag removal operations
result["html"] = re.sub(r'\<section.*?\<\/section\>','',result["html"],flags=re.S)
result["html"] = re.sub(r'<\/section\>','',result["html"],flags=re.S)
html = pq(result["html"])
result["html"] = html.remove('section').html().replace('\n \n','')
Another way to get the content
def get_content_from_html(html):
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(html).replace("-\n","-")
return d_data.rstrip()
def re_search_time(time_str): # regex-based time extraction
r_str = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2})"
mat = re.search(r_str, time_str)
if not mat:
return ""
return mat.group(0)
def make(time): # time-handling notes (see the sketch after this block)
# Relative times such as "N days / hours / minutes ago": print (datetime.datetime.now()-datetime.timedelta(days=1)).strftime("%Y-%m-%d %H:%M")
# str(datetime.datetime.now().month) gives the current year / month / day components
# Times like "今天08:56" (today 08:56): time.strftime('%Y-%m-%d', time.localtime())+' '+result["pub_time"].replace('今天','') or
# str(datetime.datetime.now().month) + ' '+result["pub_time"].replace('今天','')
# Compact dates like a = "20181107" can be sliced and re-joined: a_time = '-'.join([a[0:4],a[4:6],a[6:8]])
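A consolidated sketch of those conversions, relying on the time / datetime / re imports at the top of the script; the function name and the exact phrasings it handles are assumptions:

def normalize_pub_time(raw):
    # "今天08:56" -> "<today's date> 08:56"
    if u'今天' in raw:
        return time.strftime('%Y-%m-%d', time.localtime()) + ' ' + raw.replace(u'今天', '').strip()
    # "N天前" / "N小时前" / "N分钟前" -> absolute "%Y-%m-%d %H:%M"
    mat = re.match(u'(\\d+)(天|小时|分钟)前', raw)
    if mat:
        n, unit = int(mat.group(1)), mat.group(2)
        delta = {u'天': datetime.timedelta(days=n),
                 u'小时': datetime.timedelta(hours=n),
                 u'分钟': datetime.timedelta(minutes=n)}[unit]
        return (datetime.datetime.now() - delta).strftime('%Y-%m-%d %H:%M')
    # "20181107" -> "2018-11-07"
    if len(raw) == 8 and raw.isdigit():
        return '-'.join([raw[0:4], raw[4:6], raw[6:8]])
    return raw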
def code(): # notes on encoding issues
# json.loads(unicode) parses a unicode string into a Python object directly
# data.decode("unicode-escape").encode('utf-8') # turn \uXXXX escapes back into UTF-8 Chinese text
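A short Python 2 illustration of both points (the sample string is made up):

raw = '{"title": "\\u4e2d\\u56fd\\u91d1\\u878d\\u7f51"}'
data = json.loads(raw)                               # json.loads decodes the \uXXXX escapes itself -> u'中国金融网'
escaped = '\\u4e2d\\u56fd'.decode('unicode-escape')  # bare \uXXXX escapes -> a unicode string (Python 2 str.decode)
utf8_bytes = data['title'].encode('utf-8')           # unicode -> UTF-8 bytes for printing / storage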
Iterating over the nodes of a pyquery object and operating on each node
def index_page(self, response):
for content in response.doc('article.post div.content').items():
data= {
'url': response.save['url'],
'source_channel' : response.save['source_channel'],
'source_name' : response.save['source_name'],
'pub_time':content('div.data span.u-time').text()
}
print data
self.crawl(content('h2>a').attr.href, callback=self.detail_page,save=data)
By default pyspider deduplicates tasks and will not request the same URL twice.
To loop POST requests against the same address you have to force repeated crawls of the same URL: add an itag parameter and append a distinguishing suffix to the URL (the example below appends "#" plus a timestamp).
def index_page(self, response):
detail_url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
dict_list = response.json['rows']
for aid in dict_list:
print aid['id']
url = '%s?id=%s' % (detail_url,'')
data= {
'source_channel' : response.save['source_channel'],
'source_name' : response.save['source_name'],
'source_typeide' : response.save['typeid'],
'pub_time': aid['pubdate'],
'title': aid['title'],
'editor': aid['fshuser'],
'source': aid['source'],
'content': aid['description']
}
timestampStr = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
fakeItagForceRecrawl = "%s" % timestampStr
url = detail_url + "#" + timestampStr
self.crawl(url,itag=fakeItagForceRecrawl,data={'id':aid['id']}, headers=headers,callback=self.detail_page, save=data, method='POST')
@config(age=1)
def index_page(self, response):
source_channel = response.save['source_channel']
itemList =response.doc('div.trends').items()
for each in itemList:
param = {}
href = each('h3 a').attr.href
title = each('a').text()
print('title:'+title)
pub_time = each('span').text()
print('pub_time:'+pub_time)
param["title"] = title
param["pub_time"] = pub_time
param.update(response.save)
if href:
self.crawl(href, callback=self.detail_page,save = param,headers=headers)
@config(priority=1, age=10*24*60*60)
def detail_page(self, response):
result = copy.deepcopy(result_template)
result.update(response.save)
result["url"] = response.url
source_channel = response.save['source_channel']
result["source_name"] = source_name
result["fetch_time"] = get_now_time()
# the extraction below must be adapted to the actual page structure
html = None
html = response.doc("div#pageTxt").html()
if html:
result["html"] = html
result["content"] = get_content_from_html(html)
result["title"] = response.save['title']
pub_time =""
if 'pub_time' in response.save:
pub_time = response.save['pub_time']
else:
temp = response.doc("div.scd-title em").text()
if temp.find("今天") !=-1:
temp = "%s%s%s%s%s" %(str(datetime.datetime.now().month),"-",str(datetime.datetime.now().day)," ",temp.replace("今天",""))
else:
temp = temp.replace("月","-").replace("日","")
pub_time ="%s%s%s" %(str(datetime.datetime.now().year),"-",temp)
print("pub_time:"+pub_time)
source="蓝鲸TMT网"
print("source:"+source)
result["source"] = source
result["pub_time"] = str_2_timestamp(pub_time)
self.send_message(self.project_name, result,url=result["url"])
def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
if not time_str:
return ""
elif time_str.find(':') == -1:
fmt = "%Y-%m-%d"
elif len(re.findall(r':',time_str)) == 1:
fmt = "%Y-%m-%d %H:%M"
elif len(re.findall(r':',time_str)) == 2:
fmt = "%Y-%m-%d %H:%M:%S"
return int(time.mktime(time.strptime(time_str, fmt)))