- #coding:utf-8
- import urllib.request
- import xml.dom.minidom
- import sqlite3
- import threading
- import time
class logger(object):
    """Tiny console logger that echoes each argument on its own line."""

    def log(self, *msg):
        """Print every positional argument in *msg*, one per line.

        Nothing is printed when no arguments are supplied.
        """
        if msg:
            print(*msg, sep='\n')


# Module-wide logger instance used by the rest of the file.
Log = logger()
# Import-time smoke test: emits a short test message.
Log.log('测试下')
class downloader(object):
    """Fetch the raw body of a single URL.

    Args:
        url: Address to fetch; stored as ``self.url``.
    """

    def __init__(self, url):
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The body as ``bytes``, or ``None`` when the request fails for
            any reason (the failure is logged, not raised).
        """
        Log.log('开始下载', self.url)
        try:
            # The context manager closes the HTTP response even when
            # read() fails part-way through.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as err:
            # Deliberately best-effort: any download failure yields None.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate, and the reason is logged.
            Log.log('下载出错', err)
            return None
        Log.log('下载完毕')
        return content
class parser(object):
    """Extract the article text and linked article IDs from one page.

    Args:
        content: Page markup handed to ``xml.dom.minidom.parseString``.
            It must be well-formed XML; a parse error raised by minidom
            propagates to the caller.
    """

    def __init__(self, content):
        # Parse once up front; self.html is the resulting DOM document.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the page's text and the IDs it links to.

        Returns:
            A dict with two keys:
            ``'content'`` - data of the first child of the last
            ``<div class="content">`` whose first child carries data
            (``''`` when there is none);
            ``'url'`` - list of ID strings cut out of the ``href`` of every
            ``<a>`` element that directly contains a ``<span>``.
        """
        Log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        # Article text: getAttribute() returns '' for a missing attribute,
        # so no separate hasAttribute() check is needed.
        for div in self.html.getElementsByTagName('div'):
            if div.getAttribute('class') != 'content':
                continue
            # An empty div has no first child, and an element child has no
            # ``data``; skip those instead of raising.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        # Previous/next article links: <span> elements wrapped in an <a>.
        for span in self.html.getElementsByTagName('span'):
            parent = span.parentNode
            # A root-level span has the Document as parent, which has no
            # tagName attribute.
            if getattr(parent, 'tagName', None) != 'a':
                continue
            href = parent.getAttribute('href')
            # Drop the first 10 and last 4 characters; presumably the link
            # looks like '/articles/<id>.htm' (same shape downloadPage
            # builds) -- verify against the live markup.
            contents['url'].append(href[10:-4])

        Log.log('提取数据完毕')
        return contents
def downloadPage(qid, db):
    """Download one article page and record the findings in *db*.

    Args:
        qid: Article ID; converted with ``str()`` to build the page URL.
        db: Object providing ``updateContent``, ``addQID`` and
            ``updateStatus`` (see ``dbConnect``).

    Nothing is written when the download yields no content. The article is
    marked with status 2 only when exactly two linked IDs were found.
    """
    page_url = 'http://www.qiushibaike.com/articles/{}.htm'.format(str(qid))
    raw = downloader(page_url).download()
    if not raw:
        return

    parsed = parser(raw).parse()
    text = parsed['content']
    linked_ids = parsed['url']

    if text:
        db.updateContent(qid, text)
    # Queue every linked article for a later download round.
    for linked_id in linked_ids:
        db.addQID(linked_id)
    # Two links means both neighbouring articles were found.
    if len(linked_ids) == 2:
        db.updateStatus(qid, 2)
# Download pool: caps how many page downloads may run at the same time.
class downloaderPool(object):
    """Run ``downloadPage`` for queued IDs on a bounded set of threads.

    Args:
        maxLength: Number of worker slots, i.e. the maximum number of
            downloads in flight at once.

    Attributes set here:
        downloaders: One entry per slot; ``None`` marks a free slot,
            otherwise the ``threading.Thread`` occupying it.
        downloadList: IDs still waiting to be downloaded.
        db: Database object passed to every ``downloadPage`` call.
    """

    def __init__(self, maxLength=15):
        self.downloaders = [None] * maxLength
        self.downloadList = []
        self.db = None

    def setDownloadList(self, downloadList):
        """Merge *downloadList* into the queue, dropping duplicate IDs."""
        self.downloadList = list(set(self.downloadList + downloadList))

    def setdb(self, db):
        """Set the database object handed to each download."""
        self.db = db

    def daemon(self):
        """Supervise the pool until the queue is empty and all slots idle.

        Once per second: free the slots whose thread has finished, then
        start a new download thread in every free slot while IDs remain.
        """
        Log.log('设置守护进程')
        # With zero slots nothing can ever be started; do not spin forever.
        if not self.downloaders:
            return
        while True:
            # Release slots whose worker thread has finished.
            for index, worker in enumerate(self.downloaders):
                if worker is not None and not worker.is_alive():
                    Log.log('将下载器置空', index)
                    self.downloaders[index] = None
            # Start a download in every free slot.
            for index, worker in enumerate(self.downloaders):
                if worker is not None:
                    continue
                qid = self.getQID()
                if qid:
                    t = threading.Thread(target=downloadPage,
                                         args=(qid, self.db))
                    self.downloaders[index] = t
                    # Not joined here: joining right after start() would
                    # make the downloads run one at a time.
                    t.start()
                    Log.log('设置下载器', index)
            if not self.downloadList and all(
                    worker is None for worker in self.downloaders):
                break
            # Poll once per second.
            time.sleep(1)

    def getQID(self):
        """Remove and return the next queued ID, or ``None`` if empty."""
        if not self.downloadList:
            return None
        return self.downloadList.pop(0)

    def beginDownload(self):
        """Run the supervisor on its own thread and block until it ends."""
        daemon = threading.Thread(target=self.daemon, daemon=True)
        daemon.start()
        daemon.join()

    def getDownloader(self):
        """Return the index of the first free slot, or ``None`` if full."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None
# SQL statements used by dbConnect; every value is bound via "?" placeholders.
ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
Q_LIST = 'select id from qiushibaike where success=?'
# NOTE(review): not referenced by dbConnect below.
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'


class dbConnect(object):
    """SQLite access layer for the ``qiushibaike`` table.

    Table layout as described by the original author::

        create table qiushibaike(
            id      Integer,
            content Varchar,
            success Integer
        )

    * ``id``      - ID of the article.
    * ``content`` - text of the article.
    * ``success`` - download state; an article counts as finished once its
      text was stored and the previous/next article IDs were found:
      1 = not finished, 2 = finished.

    Every method opens its own connection to ``dbpath`` and closes it
    before returning, including when the statement fails.

    Args:
        dbpath: Path of the SQLite database file.
    """

    def __init__(self, dbpath='db.sqlite'):
        self.dbpath = dbpath

    def _run(self, sql, params, fetch=False):
        """Execute one statement on a fresh connection, always closing it.

        Returns all rows when *fetch* is true; otherwise commits and
        returns ``None``.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            cn.close()

    def addQID(self, qid):
        """Insert *qid* with status 1 (not finished).

        Best-effort: a database error is logged and swallowed, so callers
        never see an exception from a failed insert.
        """
        Log.log('插入糗事百科', qid)
        try:
            self._run(ADD_Q_ID, (qid, 1))
        except sqlite3.Error:
            Log.log('添加ID出错', qid)
        else:
            # Report success only when the insert really went through.
            Log.log('插入成功')

    def updateContent(self, qid, content):
        """Store *content* as the text of article *qid*.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        Log.log('更新糗事百科', qid, content)
        self._run(UPDATE_Q_CONTENT, (content, qid))
        Log.log('更新成功')

    def updateStatus(self, qid, flag):
        """Set the ``success`` column of article *qid* to *flag*.

        Raises:
            sqlite3.Error: If the statement fails.
        """
        Log.log('更新状态', qid, flag)
        self._run(UPDATE_Q_STATUS, (flag, qid))
        Log.log('更新状态成功')

    def getList(self, unDonloaded=1):
        """Return the IDs whose ``success`` column equals *unDonloaded*.

        Args:
            unDonloaded: Status value to select; defaults to 1
                (not finished).

        Raises:
            sqlite3.Error: If the query fails.
        """
        Log.log('获得列表')
        rows = self._run(Q_LIST, (unDonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        Log.log('获得列表成功')
        return ids
class singleDownloader(object):
    """Sequential downloader with the same interface as downloaderPool.

    Attributes:
        downloadList: IDs still waiting to be downloaded.
        db: Database object passed to ``downloadPage``; ``None`` until
            ``setdb`` is called.
    """

    def __init__(self):
        self.downloadList = []
        # Initialised here (as downloaderPool does) so the attribute always
        # exists, even before setdb() is called.
        self.db = None

    def setdb(self, db):
        """Set the database object handed to each download."""
        self.db = db

    def setDownloadList(self, downloadList):
        """Merge *downloadList* into the queue, dropping duplicate IDs."""
        self.downloadList = list(set(self.downloadList + downloadList))

    def beginDownload(self):
        """Download every queued ID one after another, emptying the queue.

        IDs are removed as they are processed; otherwise the next
        setDownloadList() call would merge them back in and finished
        articles would be downloaded again on every round.
        """
        while self.downloadList:
            downloadPage(self.downloadList.pop(0), self.db)
def main():
    """Keep downloading until the database lists no unfinished articles.

    Each round reads the unfinished IDs from ``db.sqlite``, hands them to
    the downloader, runs it, then waits one second before checking again.
    """
    db = dbConnect('db.sqlite')
    # The sequential downloader is used here; downloaderPool offers the
    # same setdb/setDownloadList/beginDownload interface.
    fetcher = singleDownloader()
    fetcher.setdb(db)

    while True:
        pending = db.getList()
        # Stop once nothing is left to download.
        if not pending:
            break
        # Feed the unfinished IDs to the downloader and process them.
        fetcher.setDownloadList(pending)
        fetcher.beginDownload()
        time.sleep(1)


if __name__ == '__main__':
    main()
1、多线程下载
2、代码分离度更高,更面向对象
各位看官有什么好想法,贴出来看看。