# Crawler main script: scrapes table downloads from CNKI statistical yearbooks.
import os
import random
import re
import sys
import threading
import time
import urllib.parse
import urllib.request

import chardet
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
count1 = 0
lock = threading.Lock()
def get_result(ybcode, page=1):
    """POST a catalog-page request for one yearbook and return the HTML.

    Parameters:
        ybcode: CNKI yearbook code, e.g. 'N2014030143'.
        page: 1-based catalog page number.

    Returns the response body decoded as UTF-8.  On any failure the process
    is terminated via sys.exit(0) after printing the error (the crawler
    cannot continue without catalog data); the original version exited
    silently, which made failures undiagnosable.
    """
    data = {'ybcode': ybcode, 'entrycode': '', 'page': page, 'pagerow': '20',
            'Referer': 'http://data.cnki.net/Yearbook'}
    # NOTE(review): the Cookie below is a captured login session and will
    # expire; refresh it before running.
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Cookie': 'Ecp_ClientId=2201106155502682665; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"sh0292",'
                  '"ShowName":"%e4%b8%ad%e5%9b%bd%e7%9f%bf%e4%b8%9a%e5%a4%a7%e5%ad%a6%e5%9b%be%e4%b9%a6%e9%a6%86",'
                  '"UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"R7eKrF"}; '
                  'c_m_LinID=LinID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                  '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=11/06/2020 17:39:14; '
                  'LID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                  '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; c_m_expire=2020-11-06 17:39:14; '
                  'Hm_lvt_911066eb2f53848f7d902db7bb8ac4d7=1604650989,1604651116,1604651168,1604654428; '
                  'ASP.NET_SessionId=3d0xpwff2pt0exxcclmw3we4; SID=009023; '
                  'Hm_lpvt_911066eb2f53848f7d902db7bb8ac4d7=1604654428',
        'Referer': 'https://login.cnki.net/login/?platform=kns&ForceReLogin=1&ReturnURL=https://www.cnki.net/',
    }
    url = "https://data.cnki.net/Yearbook/PartialGetCatalogResult"
    try:
        params = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(url, params, headers)
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except Exception as e:
        # Report before exiting (the original swallowed the error silently).
        print('get_result failed: ' + str(e))
        sys.exit(0)
def get_pageno(ybcode):
    """Fetch page 1 of the catalog and parse the total page count out of
    the pager widget ('.s_p_listl'), whose text reads '共X页共Y条'."""
    listing = BeautifulSoup(get_result(ybcode), 'lxml')
    pager_text = listing.select('.s_p_listl')[0].get_text()
    pages = int(pager_text.split("共")[2].split('页')[0])
    print('总共' + str(pages) + '页')
    return pages
def dataclear(data):
    """Sanitize a scraped string for use in file names: delete all
    newlines, carriage returns and spaces, and collapse every run of
    '>' characters into a single '-'."""
    for pattern, repl in ((r'\n+', ''), (r'\r+', ''), (r' +', ''), (r'>+', '-')):
        data = re.sub(pattern, repl, data)
    return data
def filedata(yearBook, yearBookName):
    """Crawl every catalog page of one yearbook and download each table row.

    Parameters:
        yearBook: dict with keys 'ybcode' (CNKI code) and 'year' (label).
        yearBookName: series name used in the per-year directory name.

    Side effects: creates/enters a per-year directory, downloads files via
    filedown(), and chdirs back to the parent when done.  Exits the process
    on any row-level error (original behavior, preserved).
    """
    ybcode = yearBook.get('ybcode')
    dictionaryName = dataclear(os.getcwd() + '/' + yearBook.get('year') + yearBookName)
    pageno = get_pageno(ybcode)
    print(os.getcwd())
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    for i in range(1, pageno + 1):
        print('######当前第' + str(i) + '页######')
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        for j in soup.select('tr'):
            # Parse each row once (the original re-parsed the same HTML
            # up to five times per row).
            row = BeautifulSoup(str(j), 'lxml')
            # Only rows showing the xls-download icon are downloadable.
            if not row.select('img[src="/resources/design/images/nS_down2.png"]'):
                continue
            try:
                links = row.select('td:nth-of-type(3) > a')
                if len(links) >= 1:
                    # NOTE(review): links[1] below requires len(links) >= 2;
                    # rows with exactly one link hit IndexError and exit —
                    # preserved from the original, confirm if intended.
                    title = dataclear(str(row.select('td:nth-of-type(1) > a')[0].get_text()))
                    href = links[1].get('href')
                    url = 'http://data.cnki.net' + href
                    code = href.split("=")[1]
                    if not os.path.isfile(dictionaryName + '/' + title + '.xls'):
                        # Skip appendices; throttle 8-12 s between downloads.
                        if '附录' not in title:
                            time.sleep(random.random() * 4 + 8)
                            print(filedown(title, url, code))
                    else:
                        print('已存在:' + title)
            except Exception as e:
                print('error:-------------------' + str(e))
                sys.exit(0)
    # Return to the parent directory for the next yearbook.
    os.chdir(os.path.abspath(os.path.dirname(os.getcwd())))
def count():
    """Increment the global downloaded-file counter and report it.

    Acquires the module-level `lock`: the lock was declared alongside
    `count1` but never used, leaving the shared counter unprotected.
    """
    global count1
    with lock:
        count1 = count1 + 1
        print('=====已下载:' + str(count1) + '个')
def filedown(title, url, code):
    """Download one yearbook table with headless Chrome, then rename the
    downloaded file to `title`.xls.

    Parameters:
        title: cleaned caption used as the target file name.
        url: detail page containing the download button ('Button2').
        code: substring identifying the freshly downloaded file's name.

    Returns a status string; terminates the whole process (sys.exit) on
    unexpected errors.
    """
    path = os.getcwd()
    # If a file containing this code was already downloaded, just rename
    # it and skip the browser entirely.
    for file in os.listdir(path):
        if code in file:
            print("文件已存在")
            olddir = os.path.join(path, file)
            newdir = os.path.join(path, title + '.xls')
            print(olddir)
            print(newdir)
            os.rename(olddir, newdir)
            print('重命名:' + title)
            count()
            return "已完成"
    global browser
    # Headless Chrome configured to download into the current directory
    # without popup prompts.
    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0,
             'download.default_directory': os.getcwd()}
    options.add_experimental_option('prefs', prefs)
    options.add_argument('headless')
    # pageLoadStrategy "none": get() returns immediately; we rely on the
    # explicit WebDriverWait below instead.
    # NOTE(review): the desired_capabilities= kwarg and find_element_by_id
    # were removed in Selenium 4 — this code assumes Selenium 3.x.
    desired_capabilities = DesiredCapabilities.CHROME
    desired_capabilities["pageLoadStrategy"] = "none"
    browser = webdriver.Chrome(desired_capabilities=desired_capabilities, options=options)
    wait = WebDriverWait(browser, 10)
    try:
        browser.get(url)
        # 'Button2' is the download button on the detail page.
        wait.until(EC.presence_of_element_located((By.ID, 'Button2')))
        loginButton = browser.find_element_by_id('Button2')
        loginButton.click()
        countdown = 15
        # Poll the download directory for up to 15 seconds until a file
        # whose name contains `code` appears.
        while code not in "".join(os.listdir(path)):
            time.sleep(1)
            countdown = countdown - 1
            print("=====倒计时:" + str(countdown))
            if countdown < 1:
                browser.quit()
                return "下载失败:超时"
        else:
            # while/else: runs once the loop condition turned false, i.e.
            # the downloaded file showed up before the timeout returned.
            for file in os.listdir(path):
                if code in file:
                    olddir = os.path.join(path, file)
                    newdir = os.path.join(path, title + '.xls')
                    print(olddir)
                    print(newdir)
                    if not os.path.isfile(newdir):
                        os.rename(olddir, newdir)
                        browser.quit()
                        count()
                        return '下载完成,重命名:' + title
                    else:
                        browser.quit()
                        return '文件已存在'
        # Unreachable under normal operation: every path above returns.
        print('不应该运行到这里')
        browser.quit()
        sys.exit(0)
    except Exception as e:
        print(e)
        browser.quit()
        sys.exit(0)
def spider():
    """Entry point: create the series root directory, enter it, and crawl
    each configured CNKI energy-statistics yearbook via filedata()."""
    yearBooksName = '中国能源统计年鉴'
    # os.path.join instead of string concatenation with '/'.
    dictionaryName = os.path.join(os.getcwd(), yearBooksName)
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    # (ybcode, label) pairs for each yearbook edition to crawl.
    yearBooks = [
        {'ybcode': 'N2014030143', 'year': '2013年'},
        {'ybcode': 'N2013020081', 'year': '2012年'},
        {'ybcode': 'N2005120869', 'year': '1991-1996年'},
        {'ybcode': 'N2005120761', 'year': '1989年'},
        {'ybcode': 'N2006010708', 'year': '1986年'},
    ]
    for yearBook in yearBooks:
        filedata(yearBook, yearBooksName)
if __name__ == '__main__':
spider()
# Watchdog: keeps the crawler running persistently. Monitors the crawler
# process and, whenever it stops, restarts it automatically via cmd using an
# absolute path, capturing the child's log output.
import os
import subprocess
import logging
from logging.handlers import RotatingFileHandler
# --- watchdog logging configuration --------------------------------------
# Raw string for the Windows path: the original non-raw literal only worked
# because \p, \s, \m happen not to be escape sequences; r"..." is unambiguous.
LOG_PATH_FILE = r"D:\pyCharm\studyTool\my_service_mgr.log"
LOG_MODE = 'a'                    # append to an existing log file
LOG_MAX_SIZE = 10 * 1024 * 1024   # rotate after 10 MiB
LOG_MAX_FILES = 10                # keep at most 10 rotated backups
LOG_LEVEL = logging.DEBUG
LOG_FORMAT = "%(asctime)s %(levelname)-10s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s"

handler = RotatingFileHandler(LOG_PATH_FILE, LOG_MODE, LOG_MAX_SIZE, LOG_MAX_FILES)
formatter = logging.Formatter(LOG_FORMAT)
handler.setFormatter(formatter)

# Configure the root logger so everything in this process goes to the
# rotating file.
Logger = logging.getLogger()
Logger.setLevel(LOG_LEVEL)
Logger.addHandler(handler)

# Supervisor's own pid, used to tag console messages.
pid = os.getpid()
def print_error(s):
    """Print *s* in red to stdout, tagged with this process's pid."""
    # \033[0m resets terminal color; the original trailing '\033[31;m'
    # left the color active for all subsequent output.
    print('\033[31m[%d: ERROR] %s\033[0m' % (pid, s))


def print_info(s):
    """Print *s* in green to stdout, tagged with this process's pid."""
    print('\033[32m[%d: INFO] %s\033[0m' % (pid, s))


def print_warning(s):
    """Print *s* in yellow to stdout, tagged with this process's pid."""
    print('\033[33m[%d: WARNING] %s\033[0m' % (pid, s))
def start_child_proc(command, merged):
    """Launch *command* with subprocess.Popen and return the child handle.

    Parameters:
        command: argv list (or command string) to execute.
        merged: True merges stderr into stdout; False captures stdout and
            stderr on separate pipes so run_forever() can log stderr after
            exit.  (The original's two branches were identical, leaving
            communicate() with nothing to return.)

    Raises OSError when command is None or the process cannot be started.
    """
    if command is None:
        # `raise (OSError, "...")` raised a TypeError in Python 3; raise a
        # real exception instance instead.
        raise OSError("Invalid command")
    try:
        if merged is True:
            return subprocess.Popen(command,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
        return subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except OSError as e:
        # Popen never raises CalledProcessError, so that dead handler was
        # dropped; chain the cause for diagnosability.
        raise OSError("Failed to run command!") from e
def run_forever(command):
    """Run *command* as a child process and restart it whenever it exits.

    Never returns under normal operation.  Logs every restart; after a
    non-zero exit, logs the child's captured stderr line by line.
    """
    print_info("start child process with command: " + ' '.join(command))
    Logger.info("start child process with command: " + ' '.join(command))
    merged = False
    child = start_child_proc(command, merged)
    failover = 0
    while True:
        # poll() returning non-None means the child has terminated:
        # restart it immediately (and keep restarting if it dies at once).
        while child.poll() is not None:
            failover = failover + 1
            print_warning("child process shutdown with return code: " + str(child.returncode))
            Logger.critical("child process shutdown with return code: " + str(child.returncode))
            print_warning("restart child process again, times=%d" % failover)
            Logger.info("restart child process again, times=%d" % failover)
            child = start_child_proc(command, merged)
        # Block until the child exits, collecting its output.
        out, err = child.communicate()
        if child.returncode != 0:
            if err:
                # Fixed typo: was err.slitlines(); also guard against
                # err being None when stderr was not captured.
                for errorline in err.splitlines():
                    Logger.info(errorline)
            else:
                Logger.info("execute child process failed")
    Logger.exception("!!!should never run to this!!!")
if __name__ == "__main__":
    # Pass the command as an argv list: the original string made
    # run_forever's ' '.join(command) interleave a space between every
    # character, and a list is also the safe form for subprocess.Popen.
    cmd = ['py', r'D:\pyCharm\studyTool\cnkiCrawler.py']
    run_forever(cmd)