import urllib2
from lxml import etree  # only used by the commented-out XPath alternative below
import Queue
import ssl
import re
import os
import threading

class CrawlThread(threading.Thread):
    """Crawl worker: pull chapter tasks from urlQueue, push parsed text to dataQueue."""
    def __init__(self, urlQueue, dataQueue, threadName):
        super(CrawlThread, self).__init__()
        self.urlQueue = urlQueue
        self.dataQueue = dataQueue
        self.name = threadName

    def run(self):
        while not self.urlQueue.empty():
            try:
                task = self.urlQueue.get(block=False)
            except Queue.Empty:
                break
            suffix_url = task.get('url')
            filename = task.get('filename')
            print threading.current_thread().name + '\t' + filename
            try:
                response = opener.open(url + suffix_url)
                html = response.read().decode('gbk')
                # Alternative: parse with lxml instead of a regex
                # content = etree.HTML(html)
                # text = content.xpath('//*[@id="content"]')[0].text
                # re.S lets .*? match across newlines inside the chapter body
                pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)
                match = pattern.search(html)
                if match:
                    self.dataQueue.put({'filename': filename, 'content': match.group(1)})
            except Exception as e:
                print self.name + ' failed on ' + suffix_url + ': ' + str(e)

class WriteThread(threading.Thread):
    """Write worker: drain dataQueue and save each chapter to ./novel/<filename>.txt."""
    def __init__(self, dataQueue, threadName):
        super(WriteThread, self).__init__()
        self.name = threadName
        self.dataQueue = dataQueue

    def run(self):
        while not self.dataQueue.empty():
            try:
                item = self.dataQueue.get(block=False)
            except Queue.Empty:
                break
            filename = item.get('filename')
            content = item.get('content')
            try:
                with open('./novel/' + filename.strip() + '.txt', 'w') as f:
                    f.write(doContent(content).encode('utf-8'))
            except Exception as e:
                print e

def doContent(content):
    # Turn the page's double <br/> separators into real newlines
    pattern = re.compile(r'<br/><br/> ')
    return pattern.sub('\n', content)

if __name__ == "__main__":
    # Placeholder proxy addresses -- fill in real ones before running
    proxy = {
        'http': '***',
        'https': '***'
    }
    # Skip certificate verification so the opener accepts the site's cert
    ssl_context = ssl._create_unverified_context()
    https_handler = urllib2.HTTPSHandler(context=ssl_context)
    proxy_handler = urllib2.ProxyHandler(proxy)

    # url and opener are module-level globals, shared with the crawl threads
    url = "https://www.i7wx.com/book/0/636/"
    opener = urllib2.build_opener(proxy_handler, https_handler)
    response = opener.open(url)

    # Collect (chapter url, chapter title) pairs from the index page
    pattern = re.compile(r'<a href="(\d*\.html)">(.*?)</a>', re.I)
    result = pattern.findall(response.read().decode('gbk'))

    urlQueue = Queue.Queue()
    dataQueue = Queue.Queue()
    for k, v in result:
        urlQueue.put({
            'url': k,
            'filename': v
        })

    # The write threads assume ./novel exists, so create it up front
    if not os.path.exists('./novel'):
        os.makedirs('./novel')

    # Stage 1: crawl with three threads and wait for them all to finish
    crawlThreads = [CrawlThread(urlQueue, dataQueue, 'crawl thread %d' % i)
                    for i in (1, 2, 3)]
    for t in crawlThreads:
        t.start()
    for t in crawlThreads:
        t.join()

    # Stage 2: write with three threads; dataQueue is complete at this point
    writeThreads = [WriteThread(dataQueue, 't%d' % i) for i in (4, 5, 6)]
    for t in writeThreads:
        t.start()
    for t in writeThreads:
        t.join()
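
For comparison, the same two-stage queue pattern carries over to Python 3 almost unchanged. The sketch below is a hypothetical port for illustration, not part of the original script: it stubs out the fetch/parse step and uses the renamed standard-library modules queue and threading. It preserves the key design point above, namely that the crawl threads are joined before the write threads start, which is what makes the bare empty() loop condition safe for the second stage.

# Hypothetical Python 3 sketch of the two-stage pattern; the network fetch is
# stubbed out so the structure itself runs standalone.
import queue
import threading

def crawl_worker(url_q, data_q):
    # Stage 1: drain url_q into data_q until no tasks remain.
    while True:
        try:
            task = url_q.get(block=False)
        except queue.Empty:
            return
        # A real worker would fetch and parse task['url'] here.
        data_q.put({'filename': task['filename'], 'content': 'stub'})

def write_worker(data_q):
    # Stage 2: safe to stop on empty because stage 1 has already finished.
    while True:
        try:
            item = data_q.get(block=False)
        except queue.Empty:
            return
        print(item['filename'])

url_q, data_q = queue.Queue(), queue.Queue()
for i in range(5):
    url_q.put({'url': '%d.html' % i, 'filename': 'chapter-%d' % i})

crawlers = [threading.Thread(target=crawl_worker, args=(url_q, data_q)) for _ in range(3)]
for t in crawlers:
    t.start()
for t in crawlers:
    t.join()  # stage 1 must finish before stage 2 begins

writers = [threading.Thread(target=write_worker, args=(data_q,)) for _ in range(3)]
for t in writers:
    t.start()
for t in writers:
    t.join()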