import copy
import re
import time

import scrapy

import spider_tools
from risk.items import RiskItem
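
# NOTE: `spider_tools` and `risk.items.RiskItem` are project-local modules not
# shown in this post. From their usage below, `spider_tools` is assumed to
# expose roughly these helpers (the names are real, the signatures inferred):
#   md5_hash(url)          -> hex MD5 digest of a URL, stored as `url_hash`
#   md5_hash_title(title)  -> hex MD5 digest of a title, stored as `title_hash`
#   simhash(text)          -> SimHash fingerprint of the article body
#   timestamp_transfer(ms) -> formats a millisecond timestamp as a datetime string
# `RiskItem` is assumed to be a scrapy.Item whose declared fields match the
# keys of the `zd` dict assembled in `parse()` below.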
class GatherMzlfyChinacourtGovSpider(scrapy.Spider):
    name = 'gather_mzlfy_chinacourt_gov'

    # Titles collected so far; only used by the (commented-out) de-duplication
    # debug prints at the bottom of parse().
    seen_titles = []

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Cookie": "acw_tc=78ddf8b616271985576107483ebcde43e16f3d1437e136a8347e486c41; UM_distinctid=17adc978e1c10-0fc909c16832e9-5771031-1fa400-17adc978e1dbca; CNZZDATA1279242708=1748023490-1627197080-%7C1627197080; PHPSESSID=r60e7es66skhf2r1rubpqouk93"
    }

    # Current page number for each of the two list channels.
    page1 = 1
    page2 = 1
    # Hrefs from the previously fetched page of the second channel, used to
    # detect when the site starts repeating pages past the end.
    up_data = None
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Platform name stamped on every item; hardcoded to the court's name
        # ("Manzhouli Municipal People's Court").
        self.website_title = "满洲里市人民法院"
    def start_requests(self):
        # Two article-list channels on the court site, distinguished by the
        # base64-like id segment in the URL.
        urls = [
            'http://mzlfy.chinacourt.gov.cn/article/index/id/MzRLNzAwNTAwNiACAAA/page/1.shtml',
            'http://mzlfy.chinacourt.gov.cn/article/index/id/MzRLNzAwNTAwMiACAAA/page/1.shtml',
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=self.headers,
                                 callback=self.spider_fy, meta={'url': url})
    def spider_fy(self, response):
        """List-page callback: request every article, then follow the next page."""
        url = response.meta['url']
        titles = response.xpath(
            "//div[@class='yui3-u-3-4 fr']//div[@id='list']/ul/li/span[1]//a/@title").getall()
        if not titles:
            # Empty list page: pagination has run past the last page.
            return
        url_all = response.xpath(
            "//div[@class='yui3-u-3-4 fr']//div[@id='list']/ul/li/span[1]//a/@href").getall()
        if "MzRLNzAwNTAwNiACAAA" in url:
            page = self.page1
            for url_index, href in enumerate(url_all):
                article_url = response.urljoin(href)
                yield scrapy.Request(url=article_url, headers=self.headers,
                                     callback=self.parse,
                                     meta={'page': page, 'url_index': url_index,
                                           'url': article_url, 'list_url': url})
            self.page1 += 1
            # Swap the trailing page number for the next one; a targeted regex
            # avoids str.replace() touching digits elsewhere in the URL.
            next_url = re.sub(r'/page/\d+\.shtml$', '/page/%d.shtml' % self.page1, url)
            yield scrapy.Request(url=next_url, headers=self.headers, dont_filter=True,
                                 callback=self.spider_fy, meta={'url': next_url})
        if "MzRLNzAwNTAwMiACAAA" in url:
            # This channel apparently re-serves the last page for out-of-range
            # page numbers, so stop once the hrefs repeat the previous page's.
            if url_all == self.up_data:
                return
            page = self.page2
            for url_index, href in enumerate(url_all):
                article_url = response.urljoin(href)
                yield scrapy.Request(url=article_url, headers=self.headers,
                                     callback=self.parse,
                                     meta={'page': page, 'url_index': url_index,
                                           'url': article_url, 'list_url': url})
            self.up_data = url_all
            self.page2 += 1
            next_url = re.sub(r'/page/\d+\.shtml$', '/page/%d.shtml' % self.page2, url)
            yield scrapy.Request(url=next_url, headers=self.headers, dont_filter=True,
                                 callback=self.spider_fy, meta={'url': next_url})
    def parse(self, response):
        meta = copy.copy(response.meta)
        url_index = meta['url_index']
        page_num = meta['page']
        # Overall rank of the article within its channel; each list page holds
        # 45 entries.
        classification_rank = str((int(page_num) - 1) * 45 + url_index + 1)
        title = response.xpath('//div[@class="title"]/div[@class="b_title"]//text()').getall()
        title = "".join(title).strip() if title else None
        content = response.xpath("//div[@class='text']//text()").getall()
        content = "".join(content).strip() if content else None
        release_time = response.xpath("//div[@class='sth_a']/span/text()").get(default="")
        release_time = "".join(re.findall(
            r"[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}", release_time))
        url = meta['url']
        url_hash = spider_tools.md5_hash(url)
        platform = self.website_title
        simhash = None if content is None else str(spider_tools.simhash(content))
        title_hash = spider_tools.md5_hash_title(title)
        category = '风险'
        list_url = response.meta['list_url']
        gather_time = spider_tools.timestamp_transfer(time.time() * 1000)
        cover_pic_url = None
        abstract = None
        # Strip the "来源:" ("Source:") label preceding the publisher name.
        raw_source = "".join(response.xpath(
            "//div[@class='sth_b']/div[@class='from']/text()").getall()).replace("来源:", "").strip()
        if len(raw_source) == 0:
            raw_source = self.website_title
        transport_count = None
        # The byline reads "作者:<author>发布时间:<timestamp>" ("Author: ...
        # Publish time: ..."); extract the author name between the two labels.
        author_msg = response.xpath("//div[@class='sth_a']/span/text()").get(default="").strip()
        author_msg = "".join(re.findall("作者:(.*?)发布时间", author_msg)).strip()
        if len(author_msg) == 0:
            author_msg = self.website_title
        # Fields the court site does not provide stay None.
        participation_count = None
        collection_count = None
        share_url = None
        replay_msg_list = None
        author_articles_number = None
        author_follow_number = None
        author_fans_number = None
        author_acticles_all_read_count = None
        author_acticles_all_comment_count = None
        author_comment_number = None
        # Assemble the record; key names (including the 'acticles' spelling)
        # must match the field names declared on RiskItem.
        zd = {
            'classification_rank': classification_rank,
            'title': title,
            'content': content,
            'release_time': release_time,
            'url': url,
            'url_hash': url_hash,
            'platform': platform,
            'simhash': simhash,
            'title_hash': title_hash,
            'category': category,
            'list_url': str(list_url),
            'cover_pic_url': cover_pic_url,
            'abstract': abstract,
            'raw_source': raw_source,
            'transport_count': transport_count,
            'author_msg': author_msg,
            'participation_count': participation_count,
            'collection_count': collection_count,
            'share_url': share_url,
            'replay_msg_list': replay_msg_list,
            'author_articles_number': author_articles_number,
            'author_follow_number': author_follow_number,
            'author_fans_number': author_fans_number,
            'author_acticles_all_read_count': author_acticles_all_read_count,
            'author_acticles_all_comment_count': author_acticles_all_comment_count,
            'author_comment_number': author_comment_number,
            'gather_time': gather_time,
            'read_count': None,
            'like_count': None,
            'comment_count': None,
        }
        item = RiskItem()
        item.update(copy.copy(zd))
        self.seen_titles.append(item['title'])
        # Debug helpers kept from development:
        # print("raw count:", len(self.seen_titles))
        # print("deduplicated count:", len(set(self.seen_titles)))
        # with open("aaa.txt", 'a+', encoding='utf-8') as f:
        #     f.write(str(item))
        yield item
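
# A minimal way to run this spider. Inside the Scrapy project the usual command
# is `scrapy crawl gather_mzlfy_chinacourt_gov`; the sketch below is an assumed
# standalone alternative and relies on the project's settings being
# discoverable (a scrapy.cfg in the working tree, or SCRAPY_SETTINGS_MODULE set).
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(GatherMzlfyChinacourtGovSpider)
    process.start()  # blocks until the crawl finishes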