#coding=utf-8
import os
import re
import time
from datetime import datetime
# from threading import Thread

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
import LogFile
import urllib


def _native_str(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 the native ``str`` is a byte string, so the text is encoded as
    UTF-8 (what the original code always did). On Python 3 the text is
    returned unchanged, so that no ``b'...'`` representation reaches the log.
    """
    if str is bytes:
        return text.encode('utf8')
    return text


class downfile(object):
    """Fetch a novel's index page with a headless browser and log its text.

    The title of every link found in the index page's ``novel_list`` block is
    printed and logged, followed by the text of the ``novel_content`` block of
    the page that link points to.

    Call :meth:`close` when done to shut the browser down.
    """

    def __init__(self, websearch_url, novelname):
        """Open the log file and start the browser.

        :param websearch_url: URL of the index page listing the links to
            follow.
        :param novelname: name of the novel, as text or UTF-8 encoded bytes;
            the log is written to ``novel/<novelname>.txt`` under the current
            working directory.
        """
        # Accept both text and UTF-8 bytes (the original ``unicode(...)`` call
        # only accepted a Python 2 byte string).
        if isinstance(novelname, bytes):
            novelname = novelname.decode('utf8')

        # Build the path with os.path.join rather than a literal backslash so
        # it is also correct outside Windows, and make sure the target
        # directory exists before the log file is opened in it.
        novel_dir = os.path.join(os.getcwd(), 'novel')
        if not os.path.isdir(novel_dir):
            os.makedirs(novel_dir)
        logfile = os.path.join(novel_dir, novelname + '.txt')

        # Open the log first: if that fails, no browser process is left behind.
        self.log = LogFile.LogFile(logfile)
        self.websearch_url = websearch_url

        # NOTE(review): webdriver.PhantomJS was removed in Selenium 4, so this
        # needs a Selenium 3.x installation -- confirm the pinned version.
        self.driver = webdriver.PhantomJS()
        # self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()

    def close(self):
        """Shut down the browser started by the constructor."""
        self.driver.quit()

    def scroll_foot(self):
        """Scroll the current page down by setting ``scrollTop`` to 10000.

        :return: the result of executing the script in the browser.
        """
        js = ""
        if self.driver.name in ("chrome", 'phantomjs'):
            # Chrome / PhantomJS drivers: scroll via document.body.
            js = "var q=document.body.scrollTop=10000"
        elif self.driver.name == 'internet explorer':
            # IE driver: scroll via document.documentElement.
            js = "var q=document.documentElement.scrollTop=10000"
        # For any other driver the script is empty.
        return self.driver.execute_script(js)

    def _page_source(self):
        """Return the rendered HTML of the page currently loaded."""
        return self.driver.execute_script(
            "return document.documentElement.outerHTML")

    def scrapy_date(self):
        """Log the title and content of every page linked from the index."""
        self.driver.get(self.websearch_url)
        index_doc = pq(self._page_source())
        links = index_doc('div[class="novel_list"]').find('ul').find('li').find('a')
        for link in links.items():
            href = link.attr('href')
            if not href:
                # An anchor without href has nothing to download.
                continue
            # Resolve against the index URL instead of a hard-coded site
            # prefix, so relative, root-relative and absolute links all work.
            url = urljoin(self.websearch_url, href)

            title = _native_str(link.text()).strip()
            print(title)
            self.log.WriteLog(title)

            self.driver.get(url)
            chapter_doc = pq(self._page_source())
            for content in chapter_doc('div[ID="novel_content"]').items():
                self.log.WriteLog(_native_str(content.text()).strip())
# Script entry: download the novel whose index page is given below.
obj = downfile('http://www.shanxixsa.com/sxtvi/21/21051/index.html', '官场风月')
try:
    obj.scrapy_date()
finally:
    # Always shut the browser down, even when scraping raises; nothing else
    # stops the driver that downfile.__init__ started.
    obj.driver.quit()
# NOTE(review): what follows appears to be a second module (imported above as
# `LogFile`) pasted into this file -- confirm it belongs in LogFile.py.
# -*- coding: utf-8 -*-
import os
import codecs
import datetime
import time
import logging


class LogFile(object):
    """Small wrapper around :mod:`logging` that appends messages to one file.

    Each log file gets its own logger, so several ``LogFile`` objects can be
    used side by side without writing into each other's files.
    """

    def __init__(self, fileName, level=logging.INFO):
        """Attach a UTF-8 file handler for *fileName*.

        :param fileName: path of the file to append to.
        :param level: minimum level that is written (default ``logging.INFO``).
        """
        log_path = os.path.abspath(fileName)

        # One dedicated logger per file instead of the shared root logger:
        # handlers added to the root logger pile up, so every instance would
        # also write into the files of all the others. Dots separate levels
        # of the logger hierarchy, so they are escaped to keep the loggers of
        # different files unrelated to each other.
        escaped = log_path.replace('%', '%25').replace('.', '%2E')
        self.logger = logging.getLogger('LogFile.' + escaped)
        self.logger.setLevel(level)

        # The logger is cached by name; only attach a handler the first time
        # so that re-creating a LogFile for the same path does not duplicate
        # every line.
        if not self.logger.handlers:
            fh = logging.FileHandler(log_path, encoding='utf-8')
            # Use '%(asctime)s : %(message)s' to prefix lines with a timestamp.
            formatter = logging.Formatter('%(message)s', '%Y-%m-%d %H:%M:%S')
            fh.setFormatter(formatter)
            self.logger.addHandler(fh)

    @staticmethod
    def _to_text(message):
        """Decode UTF-8 byte strings so they are written as text, not b'...'."""
        if isinstance(message, bytes):
            return message.decode('utf-8', 'replace')
        return message

    def WriteLog(self, message):
        """Write *message* at INFO level."""
        self.logger.info(self._to_text(message))

    def WriteErrorLog(self, message):
        """Write *message* at ERROR level.

        The logger level is left untouched: raising it to ERROR here would
        silently drop every later :meth:`WriteLog` call.
        """
        self.logger.error(self._to_text(message))