Basic script
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import copy
import json
import time,datetime
import re
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
result_template = {
"info_id":"", # news item ID (auto-increment)
"url":"", # original article URL
"title":"", # title
"subheading":"", # subtitle
"fetch_time":"",
"pub_time":"", # publish time taken from the article body, not the time the crawler fetched the article
"sort":"", # category interface ?
"summary":"", # article summary
"content":"", # body text
"persons":"", # people mentioned
"companys":"", # companies mentioned
"stocknames":"", # stocks mentioned
"stockcodes":"", # stock codes mentioned
"industries":"", # industries mentioned
"sections":"", # market sections mentioned
"others":"",
"info_type":"", # article type: announcement / news
"source":"", # publishing organization
"info_channel":"", # all channel/section titles from the 2nd level down, joined with underscores "_", excluding "首页" (home) and "正文" (body); see the sketch after this template
"editor":"", # editor
"keywords":"", # keywords provided by the article
"datetime":"", # crawl time
"imageAttachment":"null", # image attachments
"fileAttachment":"null", # file attachments
"html":"",
}
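A minimal sketch of how info_channel could be assembled from a page's breadcrumb trail. The helper name and the 'div.breadcrumb a' selector are assumptions for illustration, not part of the original script:

from pyquery import PyQuery as pq

def build_info_channel(doc):
    # Collect breadcrumb titles, drop "首页" (home) and "正文" (body), join the rest with "_".
    parts = [a.text().strip() for a in doc('div.breadcrumb a').items()]  # hypothetical selector
    parts = [p for p in parts if p and p not in (u'首页', u'正文')]
    return '_'.join(parts)

# Example: a trail of 首页 > 财经 > 股票 > 正文 becomes "财经_股票".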
source_name = "中国金融网"
source_list = [
{
"url": "http://www.cnfinance.cn/articles/?template=sample_397.html&page=%s",
"source_channel": "新闻",
},
{
"url": "http://www.financeun.com/articleList/1.shtml?page=%s",
"source_channel": "焦点", "source_name": "中国金融网"
}
]
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
class Handler(BaseHandler):
crawl_config = {
}
@every(minutes=2 * 60)
def on_start(self):
for source in source_list:
url = source['url']
source_channel = source['source_channel']
for i in range(1,2):
self.crawl(url % str(i),headers=headers, callback=self.index_page, save=source)
@config(age=1)
def index_page(self, response):
for each in response.doc('dl.dl_artListB dt a').items():
href = each.attr.href
if href:
self.crawl(href,headers=headers, callback=self.detail_page,save=response.save)
@config(priority=2,age=10 * 24 * 60 * 60)
def detail_page(self, response):
result = copy.deepcopy(result_template)
result["url"] = response.url
result["source_channel"] = response.save['source_channel']
result["source_name"] = source_name
if response.doc('div.contDetailsBox').html():
result["html"] = response.doc('div.contDetailsBox').html().strip()
result["editor"] = response.doc('p.p_author.span').text().replace('作者:','')
result["source"] = response.doc(' p.p_artInfo span ').eq(1).text().replace('摘自:','')
result["title"] = response.doc('h2.h2_artDetails').text()
result["pub_time"] = response.doc('p.p_artInfo span ').eq(0).text().replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
result["content"] = get_content_from_html(result["html"])
result["pub_time"] = str_2_timestamp(result["pub_time"])
result["pub_time"] = get_pub_time(result["pub_time"])
result["datetime"] = get_now_time()
self.send_message(self.project_name, result, url=result["url"])
def json_handler(self, response):
# Handler for detail pages served as JSON; fields saved by the previous request arrive in response.save.
result = copy.deepcopy(result_template)
data = json.loads(response.text)
result["title"] = response.save['title']
result["author"] = response.save['author']
html = "<h1>%s</h1>" % response.save['title']
html += data['data']['content']
result['html'] = html
result["content"] = get_content_from_html(html)
result["summary"] = data['data']['content_short']
result['pub_time'] = timestamp_to_str(response.save['display_time'])
self.send_message(self.project_name, result, url=result["url"])
def on_message(self, project, msg):
return msg
def get_content(response):
import chardet
from readability import Document
import html2text
char_encoding = chardet.detect(response.content) # bytes
#print(char_encoding)
if char_encoding["encoding"] == "utf-8" or char_encoding["encoding"] == "utf8":
doc = Document(response.content.decode("utf-8"))
else:
doc = Document(response.content.decode("gbk","ignore"))
title = doc.title()
content = doc.summary()
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(content).replace("-\n","-")
return d_data.rstrip()
def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
if not time_str:
return ""
elif len(time_str) == 9:
fmt = "%Y-%m-%d"
elif len(time_str) == 10:
fmt = "%Y-%m-%d"
elif len(time_str) == 13:
fmt = "%Y-%m-%d %H"
elif len(time_str) == 16:
fmt = "%Y-%m-%d %H:%M"
return int(time.mktime(time.strptime(time_str, fmt)))
def get_content_from_html(html):
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(html).replace("-\n","-")
return d_data.rstrip()
def get_pub_time(response):
#date_time = response.doc('div.content div.titleHead div.newsDate').text()
#date_time = response.doc("div#article.article span#pubtime_baidu").text()
#return date_time
#timeArray = time.strptime(response, "%Y-%m-%d %H:%M:%S")
# convert to a timestamp
#timestamp = time.mktime(timeArray)
# Multiplying by 10 and keeping the first 10 digits normalizes both second (10-digit)
# and millisecond (13-digit) timestamps to a 10-digit seconds string.
return str(response*10)[0:10]
def re_search_time(time_str):
r_str = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2})"
mat = re.search(r_str, time_str)
if not mat:
return ""
return mat.group(0)
def re_sub_html(html):
return re.sub(r'<!--.*?-->','',html)
def get_now_time():
return str(int(time.time()))
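json_handler above calls timestamp_to_str, which is not defined in this script. A minimal sketch, assuming display_time is a second-resolution Unix timestamp (adjust if the API returns milliseconds):

def timestamp_to_str(ts, fmt="%Y-%m-%d %H:%M:%S"):
    # Convert a Unix timestamp (seconds) into a formatted local-time string.
    return time.strftime(fmt, time.localtime(int(ts)))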
Basic processing
Tag concatenation
html = "<h1>%s</h1>" % result["title"]
html += "<div>%s</div>" % response.doc("div#News_Body_Txt_A").html()
result["title"] = response.doc('div.lasttle>p').remove() # 删除标签
result["pub_time"] = response.doc(' div.source ').addClass('beauty')
result["source"] = response.doc('div.article-info > span:contains("来源")').text()
result["html"] = response.doc('div.article_content').remove('div.article_content>div:last-child').html()
result['source'] = response.doc('div.attr span').eq(1).text().replace(u'来源:', '') # eq(index)
response.doc('ul#news_list >li:nth-child(-n+20) > div[class|="txt"] > h3 > a') # first 20 items
div.newsCon section:nth-child(2) p:nth-last-child(-n+3) # the last three p tags
response.doc("td.STYLE4").parent().parent()('tr').eq(1).text() # eq(index): 0 is the first row, 1 is the second row
response.doc('div.weicode').nextAll().remove()
response.doc("div.Middle4 div.Middle4_body div.txt1").nextAll().remove()
content_table = response.doc("div.portlet table.hui12").siblings('table')
For pages stuffed with tbody/table/tr/td nesting, it is easier to select by table tags only, e.g. body > table table table table:nth-child(2) table table td (see the sketch below).
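A self-contained sketch of that approach; the markup here is hypothetical and only illustrates skipping the intermediate layers:

from pyquery import PyQuery as pq

# Hypothetical nested-table page fragment, for illustration only.
doc = pq('<div><table><tr><td><table><tr><td>body text</td></tr></table></td></tr></table></div>')
# Descendant selectors ignore the tr/td noise and address the inner cell directly.
print(doc('table table td').text())  # -> body text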
.next() returns the sibling element immediately following each element in the matched set; if a selector is given, the next sibling is returned only when it matches that selector.
1. (Note: the next two points describe JDBC's ResultSet.next(), not pyquery.) .next() moves the cursor to the next record; if a record exists it returns true and the record becomes readable through the object it was called on (the obj in obj.next()), otherwise it returns false.
2. It is normally used with a ResultSet object and a while loop to iterate over the result set, calling getXXX(int fieldIndex) / getXXX(String columnName) inside the loop to read column values.
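A small pyquery illustration of .next() with and without a selector (the markup is hypothetical):

from pyquery import PyQuery as pq

doc = pq('<div><h2>title</h2><p class="meta">2018-11-07</p><p>body</p></div>')
print(doc('h2').next().text())          # immediate next sibling -> 2018-11-07
print(doc('h2').next('p.meta').text())  # next sibling only if it matches the selector
print(doc('h2').nextAll('p').length)    # all following p siblings -> 2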
page = source['page'] if 'page' in source else num — a per-source page can be set to 1 for channels that should not be paginated, while other channels fall back to a global num and crawl num pages, so there is no need for per-channel checks on whether to paginate (see the sketch below).
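A minimal sketch of that pattern (on_start belongs inside the Handler class; the global num and the optional per-source 'page' key are assumptions for illustration):

num = 5  # default number of pages to crawl per channel (assumed value)

def on_start(self):
    for source in source_list:
        pages = source['page'] if 'page' in source else num  # per-channel override, else the global default
        for i in range(1, pages + 1):
            self.crawl(source['url'] % str(i), headers=headers, callback=self.index_page, save=source)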
Tag removal operations
result["html"] = re.sub(r'\<section.*?\<\/section\>','',result["html"],flags=re.S)
result["html"] = re.sub(r'<\/section\>','',result["html"],flags=re.S)
html = pq(result["html"])
result["html"] = html.remove('section').html().replace('\n \n','')
Another way to get the content
def get_content_from_html(html):
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
# h.ignore_images = True
d_data = h.handle(html).replace("-\n","-")
return d_data.rstrip()
def re_search_time(time_str): # regex-based time extraction
r_str = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2})"
mat = re.search(r_str, time_str)
if not mat:
return ""
return mat.group(0)
def make(time): # time-handling notes (see the sketch after this block)
# Relative times such as "N days / hours / minutes ago": print (datetime.datetime.now()-datetime.timedelta(days=1)).strftime("%Y-%m-%d %H:%M")
# str(datetime.datetime.now().month) gives the current year / month / day components
# Times like "今天08:56" (today 08:56): time.strftime('%Y-%m-%d', time.localtime())+' '+result["pub_time"].replace('今天','') or
# str(datetime.datetime.now().month) + ' '+result["pub_time"].replace('今天','')
# Compact dates like a = "20181107" can be sliced and re-joined: a_time = '-'.join([a[0:4],a[4:6],a[6:8]])
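A consolidated sketch of those conversions, relying on the time / datetime / re imports at the top of the script; the function name and the exact phrasings it handles are assumptions:

def normalize_pub_time(raw):
    # "今天08:56" -> "<today's date> 08:56"
    if u'今天' in raw:
        return time.strftime('%Y-%m-%d', time.localtime()) + ' ' + raw.replace(u'今天', '').strip()
    # "N天前" / "N小时前" / "N分钟前" -> absolute "%Y-%m-%d %H:%M"
    mat = re.match(u'(\\d+)(天|小时|分钟)前', raw)
    if mat:
        n, unit = int(mat.group(1)), mat.group(2)
        delta = {u'天': datetime.timedelta(days=n),
                 u'小时': datetime.timedelta(hours=n),
                 u'分钟': datetime.timedelta(minutes=n)}[unit]
        return (datetime.datetime.now() - delta).strftime('%Y-%m-%d %H:%M')
    # "20181107" -> "2018-11-07"
    if len(raw) == 8 and raw.isdigit():
        return '-'.join([raw[0:4], raw[4:6], raw[6:8]])
    return raw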
def code(): # notes on encoding issues
# json.loads(unicode) parses a unicode string into a Python object directly
# data.decode("unicode-escape").encode('utf-8') # turn \uXXXX escapes back into UTF-8 Chinese text
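A short Python 2 illustration of both points (the sample string is made up):

raw = '{"title": "\\u4e2d\\u56fd\\u91d1\\u878d\\u7f51"}'
data = json.loads(raw)                               # json.loads decodes the \uXXXX escapes itself -> u'中国金融网'
escaped = '\\u4e2d\\u56fd'.decode('unicode-escape')  # bare \uXXXX escapes -> a unicode string (Python 2 str.decode)
utf8_bytes = data['title'].encode('utf-8')           # unicode -> UTF-8 bytes for printing / storage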
Iterating over the nodes of a pyquery object and operating on each node
def index_page(self, response):
for content in response.doc('article.post div.content').items():
data= {
'url': response.save['url'],
'source_channel' : response.save['source_channel'],
'source_name' : response.save['source_name'],
'pub_time':content('div.data span.u-time').text()
}
print data
self.crawl(content('h2>a').attr.href, callback=self.detail_page,save=data)
By default pyspider deduplicates tasks and will not request the same URL twice.
To loop POST requests against the same address you have to force repeated crawls of the same URL: add an itag parameter and append a distinguishing suffix to the URL (the example below appends "#" plus a timestamp).
def index_page(self, response):
detail_url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
dict_list = response.json['rows']
for aid in dict_list:
print aid['id']
url = '%s?id=%s' % (detail_url,'')
data= {
'source_channel' : response.save['source_channel'],
'source_name' : response.save['source_name'],
'source_typeide' : response.save['typeid'],
'pub_time': aid['pubdate'],
'title': aid['title'],
'editor': aid['fshuser'],
'source': aid['source'],
'content': aid['description']
}
timestampStr = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
fakeItagForceRecrawl = "%s" % timestampStr
url = detail_url + "#" + timestampStr
self.crawl(url,itag=fakeItagForceRecrawl,data={'id':aid['id']}, headers=headers,callback=self.detail_page, save=data, method='POST')
@config(age=1)
def index_page(self, response):
source_channel = response.save['source_channel']
itemList =response.doc('div.trends').items()
for each in itemList:
param = {}
href = each('h3 a').attr.href
title = each('a').text()
print('title:'+title)
pub_time = each('span').text()
print('pub_time:'+pub_time)
param["title"] = title
param["pub_time"] = pub_time
param.update(response.save)
if href:
self.crawl(href, callback=self.detail_page,save = param,headers=headers)
@config(priority=1, age=10*24*60*60)
def detail_page(self, response):
result = copy.deepcopy(result_template)
result.update(response.save)
result["url"] = response.url
source_channel = response.save['source_channel']
result["source_name"] = source_name
result["fetch_time"] = get_now_time()
# the extraction below must be adapted to the actual page structure
html = None
html = response.doc("div#pageTxt").html()
if html:
result["html"] = html
result["content"] = get_content_from_html(html)
result["title"] = response.save['title']
pub_time =""
if 'pub_time' in response.save:
pub_time = response.save['pub_time']
else:
temp = response.doc("div.scd-title em").text()
if temp.find("今天") !=-1:
temp = "%s%s%s%s%s" %(str(datetime.datetime.now().month),"-",str(datetime.datetime.now().day)," ",temp.replace("今天",""))
else:
temp = temp.replace("月","-").replace("日","")
pub_time ="%s%s%s" %(str(datetime.datetime.now().year),"-",temp)
print("pub_time:"+pub_time)
source="蓝鲸TMT网"
print("source:"+source)
result["source"] = source
result["pub_time"] = str_2_timestamp(pub_time)
self.send_message(self.project_name, result,url=result["url"])
def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
if not time_str:
return ""
elif time_str.find(':') == -1:
fmt = "%Y-%m-%d"
elif len(re.findall(r':',time_str)) == 1:
fmt = "%Y-%m-%d %H:%M"
elif len(re.findall(r':',time_str)) == 2:
fmt = "%Y-%m-%d %H:%M:%S"
return int(time.mktime(time.strptime(time_str, fmt)))