import copy
import re
import time

import scrapy

import spider_tools
from risk.items import RiskItem
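
# NOTE: `spider_tools` and `risk.items.RiskItem` are project-local modules not
# shown in this post. From their usage below, `spider_tools` is assumed to
# expose roughly these helpers (the names are real, the signatures inferred):
#   md5_hash(url)          -> hex MD5 digest of a URL, stored as `url_hash`
#   md5_hash_title(title)  -> hex MD5 digest of a title, stored as `title_hash`
#   simhash(text)          -> SimHash fingerprint of the article body
#   timestamp_transfer(ms) -> formats a millisecond timestamp as a datetime string
# `RiskItem` is assumed to be a scrapy.Item whose declared fields match the
# keys of the `zd` dict assembled in `parse()` below.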
class GatherMzlfyChinacourtGovSpider(scrapy.Spider):
    name = 'gather_mzlfy_chinacourt_gov'

    # Titles collected so far; only used by the (commented-out) de-duplication
    # debug prints at the bottom of parse().
    seen_titles = []

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Cookie": "acw_tc=78ddf8b616271985576107483ebcde43e16f3d1437e136a8347e486c41; UM_distinctid=17adc978e1c10-0fc909c16832e9-5771031-1fa400-17adc978e1dbca; CNZZDATA1279242708=1748023490-1627197080-%7C1627197080; PHPSESSID=r60e7es66skhf2r1rubpqouk93"
    }

    # Current page number for each of the two list channels.
    page1 = 1
    page2 = 1
    # Hrefs from the previously fetched page of the second channel, used to
    # detect when the site starts repeating pages past the end.
    up_data = None
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Platform name stamped on every item; hardcoded to the court's name
        # ("Manzhouli Municipal People's Court").
        self.website_title = "满洲里市人民法院"
    def start_requests(self):
        # Two article-list channels on the court site, distinguished by the
        # base64-like id segment in the URL.
        urls = [
            'http://mzlfy.chinacourt.gov.cn/article/index/id/MzRLNzAwNTAwNiACAAA/page/1.shtml',
            'http://mzlfy.chinacourt.gov.cn/article/index/id/MzRLNzAwNTAwMiACAAA/page/1.shtml',
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=self.headers,
                                 callback=self.spider_fy, meta={'url': url})
    def spider_fy(self, response):
        """List-page callback: request every article, then follow the next page."""
        url = response.meta['url']
        titles = response.xpath(
            "//div[@class='yui3-u-3-4 fr']//div[@id='list']/ul/li/span[1]//a/@title").getall()
        if not titles:
            # Empty list page: pagination has run past the last page.
            return
        url_all = response.xpath(
            "//div[@class='yui3-u-3-4 fr']//div[@id='list']/ul/li/span[1]//a/@href").getall()
        if "MzRLNzAwNTAwNiACAAA" in url:
            page = self.page1
            for url_index, href in enumerate(url_all):
                article_url = response.urljoin(href)
                yield scrapy.Request(url=article_url, headers=self.headers,
                                     callback=self.parse,
                                     meta={'page': page, 'url_index': url_index,
                                           'url': article_url, 'list_url': url})
            self.page1 += 1
            # Swap the trailing page number for the next one; a targeted regex
            # avoids str.replace() touching digits elsewhere in the URL.
            next_url = re.sub(r'/page/\d+\.shtml$', '/page/%d.shtml' % self.page1, url)
            yield scrapy.Request(url=next_url, headers=self.headers, dont_filter=True,
                                 callback=self.spider_fy, meta={'url': next_url})
        if "MzRLNzAwNTAwMiACAAA" in url:
            # This channel apparently re-serves the last page for out-of-range
            # page numbers, so stop once the hrefs repeat the previous page's.
            if url_all == self.up_data:
                return
            page = self.page2
            for url_index, href in enumerate(url_all):
                article_url = response.urljoin(href)
                yield scrapy.Request(url=article_url, headers=self.headers,
                                     callback=self.parse,
                                     meta={'page': page, 'url_index': url_index,
                                           'url': article_url, 'list_url': url})
            self.up_data = url_all
            self.page2 += 1
            next_url = re.sub(r'/page/\d+\.shtml$', '/page/%d.shtml' % self.page2, url)
            yield scrapy.Request(url=next_url, headers=self.headers, dont_filter=True,
                                 callback=self.spider_fy, meta={'url': next_url})
    def parse(self, response):
        meta = copy.copy(response.meta)
        url_index = meta['url_index']
        page_num = meta['page']
        # Overall rank of the article within its channel; each list page holds
        # 45 entries.
        classification_rank = str((int(page_num) - 1) * 45 + url_index + 1)
        title = response.xpath('//div[@class="title"]/div[@class="b_title"]//text()').getall()
        title = "".join(title).strip() if title else None
        content = response.xpath("//div[@class='text']//text()").getall()
        content = "".join(content).strip() if content else None
        release_time = response.xpath("//div[@class='sth_a']/span/text()").get(default="")
        release_time = "".join(re.findall(
            r"[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}", release_time))
        url = meta['url']
        url_hash = spider_tools.md5_hash(url)
        platform = self.website_title
        simhash = None if content is None else str(spider_tools.simhash(content))
        title_hash = spider_tools.md5_hash_title(title)
        category = '风险'
        list_url = response.meta['list_url']
        gather_time = spider_tools.timestamp_transfer(time.time() * 1000)
        cover_pic_url = None
        abstract = None
        # Strip the "来源:" ("Source:") label preceding the publisher name.
        raw_source = "".join(response.xpath(
            "//div[@class='sth_b']/div[@class='from']/text()").getall()).replace("来源:", "").strip()
        if len(raw_source) == 0:
            raw_source = self.website_title
        transport_count = None
        # The byline reads "作者:<author>发布时间:<timestamp>" ("Author: ...
        # Publish time: ..."); extract the author name between the two labels.
        author_msg = response.xpath("//div[@class='sth_a']/span/text()").get(default="").strip()
        author_msg = "".join(re.findall("作者:(.*?)发布时间", author_msg)).strip()
        if len(author_msg) == 0:
            author_msg = self.website_title
        # Fields the court site does not provide stay None.
        participation_count = None
        collection_count = None
        share_url = None
        replay_msg_list = None
        author_articles_number = None
        author_follow_number = None
        author_fans_number = None
        author_acticles_all_read_count = None
        author_acticles_all_comment_count = None
        author_comment_number = None
        # Assemble the record; key names (including the 'acticles' spelling)
        # must match the field names declared on RiskItem.
        zd = {
            'classification_rank': classification_rank,
            'title': title,
            'content': content,
            'release_time': release_time,
            'url': url,
            'url_hash': url_hash,
            'platform': platform,
            'simhash': simhash,
            'title_hash': title_hash,
            'category': category,
            'list_url': str(list_url),
            'cover_pic_url': cover_pic_url,
            'abstract': abstract,
            'raw_source': raw_source,
            'transport_count': transport_count,
            'author_msg': author_msg,
            'participation_count': participation_count,
            'collection_count': collection_count,
            'share_url': share_url,
            'replay_msg_list': replay_msg_list,
            'author_articles_number': author_articles_number,
            'author_follow_number': author_follow_number,
            'author_fans_number': author_fans_number,
            'author_acticles_all_read_count': author_acticles_all_read_count,
            'author_acticles_all_comment_count': author_acticles_all_comment_count,
            'author_comment_number': author_comment_number,
            'gather_time': gather_time,
            'read_count': None,
            'like_count': None,
            'comment_count': None,
        }
        item = RiskItem()
        item.update(copy.copy(zd))
        self.seen_titles.append(item['title'])
        # Debug helpers kept from development:
        # print("raw count:", len(self.seen_titles))
        # print("deduplicated count:", len(set(self.seen_titles)))
        # with open("aaa.txt", 'a+', encoding='utf-8') as f:
        #     f.write(str(item))
        yield item
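
# A minimal way to run this spider. Inside the Scrapy project the usual command
# is `scrapy crawl gather_mzlfy_chinacourt_gov`; the sketch below is an assumed
# standalone alternative and relies on the project's settings being
# discoverable (a scrapy.cfg in the working tree, or SCRAPY_SETTINGS_MODULE set).
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(GatherMzlfyChinacourtGovSpider)
    process.start()  # blocks until the crawl finishes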