Python实现的解析网页看报程序

功能与前一篇相同,主要是对部分逻辑进行了函数封装,并添加了多线程以提高性能。
# coding=gbk
import urllib2
import socket
import re
import time
import os
import threading
import time
        
def ReadPageContent(url):
    page_content = ""
    try:
        page_open = urllib2.urlopen(url)
        page_content = page_open.read()


        print "Read page succeed!\n",
    except:
        print "Exception: Read page failed!\n",
        return ""
    return page_content


def MatchReg(reg_str, content):
    """Return all non-overlapping matches of ``reg_str`` found in ``content``.

    The result follows ``re.findall`` conventions: a list of strings when
    the pattern has no (or one) capture group, a list of tuples when it has
    several, and an empty list when nothing matches.
    """
    return re.findall(reg_str, content)


def CreateStoreFolder(base_folder="E:\\news_folder\\"):
    """Create (if needed) and return today's image folder.

    The folder is ``<base_folder>/<YYYY-MM-DD>/`` using the local date.

    Args:
        base_folder: Root directory under which the dated folder is made.
            Defaults to the location the script has always used.

    Returns:
        The dated folder path, ending with a path separator so callers can
        append a file name by plain string concatenation.
    """
    today_num = time.strftime('%Y-%m-%d')
    image_folder = os.path.join(base_folder, today_num) + os.sep

    # os.makedirs creates missing parent directories too, so a single call
    # covers both the base folder and the dated sub-folder.
    if not os.path.isdir(image_folder):
        os.makedirs(image_folder)

    # Single-argument parenthesised print emits the same text on
    # Python 2 and Python 3.
    print("News image folder = " + image_folder)
    return image_folder


def GetNewsPageCount(content):
    """Extract the page count from the first "<start marker>N<end marker>" run.

    The text between the first start marker and the next end marker after
    it is parsed as an integer.

    Args:
        content: Page text to search.

    Returns:
        The parsed page count, or 0 when either marker is missing or the
        text between them is not an integer.
    """
    start_marker = "共"
    end_marker = "页"

    marker_index = content.find(start_marker)
    if marker_index < 0:
        print("Cannot find the page count string, return!")
        return 0

    # Skip the marker by its real length instead of a hard-coded 2, which
    # only holds when the marker happens to be encoded as two bytes.
    num_start = marker_index + len(start_marker)

    # Search to the very end of the content; slicing with [:-1] would drop
    # the last character and miss an end marker that finishes the text.
    end_index = content.find(end_marker, num_start)
    if end_index < 0:
        print("Cannot find the page count string, return!")
        return 0

    page_num = content[num_start:end_index]
    # Single-argument parenthesised print emits the same text on
    # Python 2 and Python 3.
    print("Get news page count = " + page_num)

    try:
        return int(page_num)
    except ValueError:
        # The markers matched unrelated text; report it like the other
        # "not found" cases instead of raising.
        print("Page count string is not a number, return!")
        return 0


class MThreadReader(threading.Thread):
    def __init__(self, index, page_url, folder):
        threading.Thread.__init__(self)
        self.thread_index = index
        self.thread_stop = False
        self.page_url = page_url
        self.folder = folder
        print "Thread " + str(self.thread_index) + ": " + self.page_url + "\n",
        
    def run(self):
        thread_title = "Thread " + str(self.thread_index) + ": "
        image_page_content = ReadPageContent(self.page_url)
        if (image_page_content == ""):
            print thread_title + "Read new page failed. Thread exit T_T\n",
            exit()
            
        reg_str = r'http://i\S+jpg'
        image_results = MatchReg(reg_str, image_page_content)
        if len(image_results) == 0:
            print thread_title + "Cannot find news image" + str(self.thread_index) + "!\n",
            return


        image_url = image_results[0]
        
        news_image_open = urllib2.urlopen(image_url)


        image_name = self.folder + "page_" + str(self.thread_index) + ".jpg"
        imgfile = open(image_name, 'wb')
        print thread_title + "Downloadding image...\n",
        try:
            while True:
                data = news_image_open.read(1024*10)
                if not data:
                    break
                imgfile.write(data)
            imgfile.close()
        except:
            print thread_title + "Downloadding image " + str(self.thread_index) + " failed!\n",
            print thread_title + "Unexpected error: " + sys.exc_info()[0] + sys.exc_info()[1]  + "\n",
        else:
            print thread_title + "Download news image " + str(self.thread_index) + " succeed!\n",
        
    def stop(self):
        self.thread_stop = True
       
        
if __name__ == '__main__':
    # set default timeout time
    timeout = 15
    urllib2.socket.setdefaulttimeout(timeout)


    # get the home page content
    home_url = "http://www.hqck.net"
    home_page_content = ReadPageContent(home_url)


    # get the latest news link
    reg_str = r'<a class="item-baozhi" href="/arc/jwbt/ckxx/\d{4}/\d{4}/\w+\.html"><span class.+>.+</span></a>'


    match_results = MatchReg(reg_str, home_page_content)
    if len(match_results) <= 0:
        print "Cannot match the reg\n",
        exit()


    today_news = match_results[0] 
    for link in match_results:
        if (link.find('参考')):
            today_news = link
    print "Latest news link = " + today_news + "\n",


    url_s = today_news.find("/arc/")
    if (url_s < 0):
        print "Get the news link failed! T_T...\n",
        exit()
        
    url_e = today_news.find(".html")
    if (url_s < 0):
        print "Get the news link failed! T_T...\n",
        exit()
    url_e = url_e + 5  


    print "Link index = [" + str(url_s) + "," + str(url_e) + "]"
    part_news_uri = today_news[url_s:url_e]
    print "News part url = " + part_news_uri + "\n",


    full_news_url = home_url + part_news_uri
    print "News full url = " + full_news_url + "\n",
    print


    # create the image folder
    image_folder = CreateStoreFolder()


    # get the new page count
    context_uri = full_news_url[0:-5]
    first_page_url = context_uri + ".html"
    first_page_content = ReadPageContent(first_page_url)
    news_page_count = GetNewsPageCount(first_page_content)


    print "Total " + str(news_page_count) + " pages:\n",
    print
    if news_page_count <= 0:
        print "Get news page count failed.\n",
        exit()


    # get the news page link
    url_arr = []
    page_index = 1
    while page_index <= news_page_count:
        page_url = context_uri
        if page_index > 1:
            page_url = page_url + "_" + str(page_index)
        page_url = page_url + ".html"
        url_arr.append(page_url)
        page_index = page_index + 1


    index = 1
    reader_arr = []
    for u in url_arr:
        reader_arr.append(MThreadReader(index, u, image_folder))
        index = index + 1


    
    for reader in reader_arr:
        reader.start()


    for reader in reader_arr:
        reader.join()
        
    print "Enjoy it^^! (" + str(image_folder) + ")\n",
    print "=============================================================\n",
    ans = raw_input("Read news right now? (Y/N): ")
    if(ans != "N" and ans !="n"):
        os.startfile(image_folder)
    #raw_input("Press Enter to exit...")
    


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
资源包主要包含以下内容: ASP项目源码:每个资源包中都包含完整的ASP项目源码,这些源码采用了经典的ASP技术开发,结构清晰、注释详细,帮助用户轻松理解整个项目的逻辑和实现方式。通过这些源码,用户可以学习到ASP的基本语法、服务器端脚本编写方法、数据库操作、用户权限管理等关键技术。 数据库设计文件:为了方便用户更好地理解系统的后台逻辑,每个项目中都附带了完整的数据库设计文件。这些文件通常包括数据库结构图、数据表设计文档,以及示例数据SQL脚本。用户可以通过这些文件快速搭建项目所需的数据库环境,并了解各个数据表之间的关系和作用。 详细的开发文档:每个资源包都附有详细的开发文档,文档内容包括项目背景介绍、功能模块说明、系统流程图、用户界面设计以及关键代码解析等。这些文档为用户提供了深入的学习材料,使得即便是从零开始的开发者也能逐步掌握项目开发的全过程。 项目演示与使用指南:为帮助用户更好地理解和使用这些ASP项目,每个资源包中都包含项目的演示文件和使用指南。演示文件通常以视频或图文形式展示项目的主要功能和操作流程,使用指南则详细说明了如何配置开发环境、部署项目以及常见问题的解决方法。 毕业设计参考:对于正在准备毕业设计的学生来说,这些资源包是绝佳的参考材料。每个项目不仅功能完善、结构清晰,还符合常见的毕业设计要求和标准。通过这些项目,学生可以学习到如何从零开始构建一个完整的Web系统,并积累丰富的项目经验。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值