完整版代码github地址:https://github.com/Monster2848/caipanwenshu
爬虫主体逻辑:
import re
import time
import pymongo
import requests
import execjs
import threadpool
from wenshu_task.docid import getkey, decode_docid
from wenshu_task.my_logger import logger
from wenshu_task.redis_ip_pool import RedisPara
from wenshu_task.wenshu_method import ParseJs, ParseDetail, Para
from wenshu_task.wenshu_setting import ExceptionCollections, ThreadNum, MongoSetting
'''
文书网爬虫:http://wenshu.court.gov.cn/
'''
class NewWenshu(object):
'''裁判文书网'''
def __init__(self, page, case_type, get_ua, get_pr):
    """Store the settings for one crawl task on the instance.

    Args:
        page: Stored unchanged as ``self.page``.
        case_type: Stored unchanged as ``self.case_type``.
        get_ua: Value sent as the ``User-Agent`` header by the request
            methods of this class.
        get_pr: Proxy address; it is prefixed with ``http://`` and used
            as the proxy for plain-HTTP requests.
    """
    # Task parameters supplied by the caller.
    self.page, self.case_type = page, case_type

    # Request identity: user agent plus an HTTP-only proxy mapping.
    proxy_url = 'http://{}'.format(get_pr)
    self.ua = get_ua
    self.proxies = dict(http=proxy_url)

    # Scratch container and logger used while the task runs.
    self.item = dict()
    self.log = logger()

    # Fixed query settings. "法院层级" translates to "court level";
    # presumably these are the page size and sort key/direction sent with
    # list queries -- NOTE(review): confirm against the list-request code.
    self.data_count = 200
    self.order = "法院层级"
    self.direction = "asc"
# 首页第一次
def home_1(self):
    """Request the landing page and solve its JavaScript challenge.

    The landing page is fetched without following redirects. Five
    challenge variables are extracted from the response body, prepended as
    JavaScript ``var`` declarations to the contents of the local file
    ``home_1.js``, and the script's ``wzwschallenge`` function is then
    evaluated through execjs to build the follow-up URL.

    Returns:
        ``None`` if any of the challenge variables is missing from the
        response body. Otherwise a tuple ``(next_url, wzws_cid)``:
        ``next_url`` is the challenge URL to request next and ``wzws_cid``
        is the value of the ``wzws_cid`` cookie set by this response
        (``None`` when the cookie is absent).

    Raises:
        Nothing is caught here: errors from ``requests.get``, from opening
        ``home_1.js`` and from execjs propagate to the caller.
    """
    resp = requests.get(
        url='http://wenshu.court.gov.cn/',
        headers={"User-Agent": self.ua},
        proxies=self.proxies,
        allow_redirects=False,
        timeout=20,
    )
    html_js = resp.text

    # Pull each challenge variable out of the inline script. A missing
    # variable means the page is not the expected challenge page, so give
    # up with None. Checking the match explicitly replaces the former bare
    # ``except:``, which also swallowed KeyboardInterrupt/SystemExit.
    names = ('dynamicurl', 'wzwsquestion', 'wzwsfactor', 'wzwsmethod', 'wzwsparams')
    values = {}
    for name in names:
        found = re.search('{}="(.*?)"'.format(name), html_js)
        if found is None:
            return None
        values[name] = found.group(1)

    # Re-declare the extracted values as JS globals ahead of the solver code.
    declarations = ''.join(
        'var {}="{}";'.format(name, values[name]) for name in names
    )
    para_part = '\n' + declarations + '\n'

    # NOTE: the path is relative to the current working directory.
    # The original passed re.DOTALL as open()'s third positional argument,
    # which is ``buffering`` (an accidental 16-byte buffer); it is dropped.
    with open('home_1.js', 'r') as f:
        js_code = f.read()

    ctx = execjs.compile(para_part + js_code)
    wzwschallenge = ctx.call("wzwschallenge")

    next_url = (
        'http://wenshu.court.gov.cn' + values['dynamicurl']
        + '?' + 'wzwschallenge=' + wzwschallenge
    )
    wzws_cid = requests.utils.dict_from_cookiejar(resp.cookies).get("wzws_cid")
    return next_url, wzws_cid
# 首页第二次
def home_2(self):
    """Perform the second home-page request, answering the challenge.

    Runs ``home_1`` first. If that yields nothing, this method returns
    ``None`` as well. Otherwise it requests the challenge URL produced by
    ``home_1`` (no redirects followed), sending back the ``wzws_cid``
    cookie obtained there.

    Returns:
        The value of the ``wzws_cid`` cookie set by the second response,
        or ``None`` when ``home_1`` failed or the cookie is absent.
    """
    first_step = self.home_1()
    if not first_step:
        return None

    challenge_url, first_cid = first_step
    response = requests.get(
        url=challenge_url,
        headers={"User-Agent": self.ua},
        cookies={"wzws_cid": first_cid},
        proxies=self.proxies,
        timeout=20,
        allow_redirects=False,
    )

    # Read the refreshed cookie out of the response's cookie jar.
    cookie_map = requests.utils.dict_from_cookiejar(response.cookies)
    return cookie_map.get("wzws_cid")
# 列表页第一次
def list_1(self):
box = self.home_2()
if not box:
return None
next_wzws_cid = box
url = "http://wenshu.court.gov.cn/List/List"
resp = requests.get(
url=url,
headers={
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",