# Same functionality as the previous post, refactored into functions, with multi-threading added for performance.
# coding=gbk
import os
import re
import socket
import sys
import threading
import time
import urllib2
def ReadPageContent(url):
    """Fetch *url* and return the raw page body, or "" on any failure.

    The HTTP response object is always closed, even when read() raises
    (the original leaked the connection and used a bare ``except:``).
    """
    page_open = None
    try:
        page_open = urllib2.urlopen(url)
        page_content = page_open.read()
        print("Read page succeed!")
        return page_content
    except Exception:
        # Network failures (urllib2.URLError, socket.timeout, ...) are
        # deliberately reduced to a best-effort empty result.
        print("Exception: Read page failed!")
        return ""
    finally:
        if page_open is not None:
            page_open.close()
def MatchReg(reg_str, content):
    """Return every non-overlapping match of *reg_str* in *content* (possibly empty)."""
    return re.compile(reg_str).findall(content)
def CreateStoreFolder(base_folder="E:\\news_folder\\"):
    """Create (if needed) and return today's image folder under *base_folder*.

    Layout is <base_folder><YYYY-MM-DD>\\ ; the returned path keeps the
    trailing separator because callers concatenate file names onto it.

    base_folder -- root directory for all downloaded images; kept as a
                   default parameter so the historical zero-argument call
                   still behaves exactly as before.
    """
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)
    # strftime with no time argument formats the current local time,
    # so the explicit time.localtime(time.time()) was redundant.
    today_num = time.strftime('%Y-%m-%d')
    image_folder = base_folder + today_num + "\\"
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)
    print("News image folder = " + image_folder)
    return image_folder
def GetNewsPageCount(content):
    """Parse a news page body and return the total page count.

    Looks for the marker text "共<N>页" ("N pages in total").  Returns 0
    when the marker is missing or <N> is not a valid integer.
    """
    start = content.find("共")
    if start < 0:
        print("Cannot find the page count string, return!")
        return 0
    # Keep the final character: the original sliced with [:-1] and so
    # silently failed whenever "页" was the very last character.
    tail = content[start:]
    end = tail.find("页")
    if end < 0:
        print("Cannot find the page count string, return!")
        return 0
    # len("共") instead of the hard-coded 2: the marker's width depends
    # on the string type (2 bytes in gbk byte strings, 1 in unicode).
    page_num = tail[len("共"):end]
    print("Get news page count = " + page_num)
    try:
        return int(page_num)
    except ValueError:
        # Marker found but the text between "共" and "页" is not a number.
        print("Cannot parse the page count number, return!")
        return 0
class MThreadReader(threading.Thread):
    """Worker thread that downloads one news page's image into *folder*.

    index    -- 1-based page number, used for the output file name
    page_url -- URL of the news page to scan for the image link
    folder   -- destination directory (must end with a path separator)
    """
    def __init__(self, index, page_url, folder):
        threading.Thread.__init__(self)
        self.thread_index = index
        self.thread_stop = False   # cooperative stop flag, set by stop()
        self.page_url = page_url
        self.folder = folder
        print("Thread " + str(self.thread_index) + ": " + self.page_url)

    def run(self):
        thread_title = "Thread " + str(self.thread_index) + ": "
        image_page_content = ReadPageContent(self.page_url)
        if image_page_content == "":
            # 'return' ends only this thread; the original called exit(),
            # which raises SystemExit and prints a spurious traceback.
            print(thread_title + "Read new page failed. Thread exit T_T")
            return
        reg_str = r'http://i\S+jpg'
        image_results = MatchReg(reg_str, image_page_content)
        if len(image_results) == 0:
            print(thread_title + "Cannot find news image" + str(self.thread_index) + "!")
            return
        image_url = image_results[0]
        image_name = self.folder + "page_" + str(self.thread_index) + ".jpg"
        print(thread_title + "Downloadding image...")
        news_image_open = None
        imgfile = None
        try:
            # urlopen now inside the try: a failed request must not kill
            # the thread with an unhandled traceback (it was outside before).
            news_image_open = urllib2.urlopen(image_url)
            imgfile = open(image_name, 'wb')
            # Stream in 10 KiB chunks so large images are never held in RAM.
            while True:
                data = news_image_open.read(1024 * 10)
                if not data:
                    break
                imgfile.write(data)
        except Exception:
            print(thread_title + "Downloadding image " + str(self.thread_index) + " failed!")
            # str() each part: the original concatenated the raw exception
            # type/value objects to a string, which itself raised TypeError.
            print(thread_title + "Unexpected error: " + str(sys.exc_info()[0]) + str(sys.exc_info()[1]))
        else:
            print(thread_title + "Download news image " + str(self.thread_index) + " succeed!")
        finally:
            # Always release the file handle and the HTTP connection,
            # even when the download loop raised mid-way.
            if imgfile is not None:
                imgfile.close()
            if news_image_open is not None:
                news_image_open.close()

    def stop(self):
        """Request a cooperative stop (flag only; run() currently ignores it)."""
        self.thread_stop = True
if __name__ == '__main__':
    # Abort hung connections instead of blocking forever.
    timeout = 15
    urllib2.socket.setdefaulttimeout(timeout)

    # Fetch the site home page.
    home_url = "http://www.hqck.net"
    home_page_content = ReadPageContent(home_url)

    # Locate the candidate news links on the home page.
    reg_str = r'<a class="item-baozhi" href="/arc/jwbt/ckxx/\d{4}/\d{4}/\w+\.html"><span class.+>.+</span></a>'
    match_results = MatchReg(reg_str, home_page_content)
    if len(match_results) <= 0:
        print("Cannot match the reg")
        exit()
    today_news = match_results[0]
    for link in match_results:
        # BUG FIX: find() returns -1 when absent, which is truthy, so the
        # original condition matched almost every link; test >= 0 instead.
        if link.find('参考') >= 0:
            today_news = link
    print("Latest news link = " + today_news)

    # Cut the relative URI out of the anchor tag.
    url_s = today_news.find("/arc/")
    if url_s < 0:
        print("Get the news link failed! T_T...")
        exit()
    url_e = today_news.find(".html")
    # BUG FIX: the original re-tested url_s here, so a missing ".html"
    # slipped through and produced a bogus slice.
    if url_e < 0:
        print("Get the news link failed! T_T...")
        exit()
    url_e = url_e + 5  # keep ".html" (5 characters) inside the slice
    print("Link index = [" + str(url_s) + "," + str(url_e) + "]")
    part_news_uri = today_news[url_s:url_e]
    print("News part url = " + part_news_uri)
    full_news_url = home_url + part_news_uri
    print("News full url = " + full_news_url)
    print("")

    # Create today's image folder.
    image_folder = CreateStoreFolder()

    # Read the first page to learn how many pages the issue has.
    context_uri = full_news_url[0:-5]   # strip trailing ".html"
    first_page_url = context_uri + ".html"
    first_page_content = ReadPageContent(first_page_url)
    news_page_count = GetNewsPageCount(first_page_content)
    print("Total " + str(news_page_count) + " pages:")
    print("")
    if news_page_count <= 0:
        print("Get news page count failed.")
        exit()

    # Page 1 is "<uri>.html"; page N (N > 1) is "<uri>_N.html".
    url_arr = []
    for page_index in range(1, news_page_count + 1):
        page_url = context_uri
        if page_index > 1:
            page_url = page_url + "_" + str(page_index)
        url_arr.append(page_url + ".html")

    # One download thread per page; wait for all of them to finish.
    reader_arr = [MThreadReader(i + 1, u, image_folder)
                  for i, u in enumerate(url_arr)]
    for reader in reader_arr:
        reader.start()
    for reader in reader_arr:
        reader.join()
    print("Enjoy it^^! (" + str(image_folder) + ")")
    print("=============================================================")
    ans = raw_input("Read news right now? (Y/N): ")
    if ans != "N" and ans != "n":
        os.startfile(image_folder)
    #raw_input("Press Enter to exit...")