Python 网络蜘蛛

用 Python 编写的简单网络蜘蛛,使用 chardet 库检测网页编码后再解码。其中的关键技术有两点:用多线程做超时控制,用正则表达式解析 HTML。

import re
import urllib.request
import chardet
import _thread
import time


def time_control():
    """Watchdog loop meant to run in a background thread.

    Once per second it compares the current time with the global
    ``TIME_C`` heartbeat timestamp.  When more than 10 seconds have
    passed since the heartbeat was last refreshed, it resets the
    heartbeat and calls ``_thread.interrupt_main()`` so the main thread
    receives a ``KeyboardInterrupt``.  The loop never returns.
    """
    global TIME_C
    while True:
        time.sleep(1)
        stalled_for = time.time() - TIME_C
        if stalled_for <= 10:
            continue
        # Reset the heartbeat first so the next interrupt is at least
        # another 10 seconds away.
        TIME_C = time.time()
        _thread.interrupt_main()
# Matches a double-quoted http(s) URL up to and including "pku.edu.cn".
# The dots are escaped so they only match literal dots (unescaped, they
# would also accept e.g. "pkuXeduYcn").
_PKU_LINK_RE = re.compile(r'"https?://[^\b\n<>"]*?pku\.edu\.cn')


def put_in(from_url):
    """Fetch ``from_url`` and append newly seen pku.edu.cn links to ``url_list``.

    The page body is decoded with the encoding guessed by ``chardet`` and
    stored in the global ``S``.  Every match of the link pattern that is
    not already in the global ``url_list`` is appended to it and printed.
    The global ``TIME_C`` heartbeat is refreshed around each potentially
    slow step so the watchdog in ``time_control`` does not fire while
    progress is being made.

    Args:
        from_url: URL of the page to download.

    Returns:
        None.  Responses whose status is not 200, ``URLError``,
        ``TimeoutError`` and ``KeyboardInterrupt`` all end the call
        quietly (the latter two after printing a "Time out" notice).
    """
    global S, url_list, TIME_C
    try:
        TIME_C = time.time()
        # The context manager closes the response on every exit path.
        with urllib.request.urlopen(from_url) as temp:
            if temp.status != 200:
                return
            ts = temp.read()
        TIME_C = time.time()
        # chardet reports None when it cannot guess an encoding; fall back
        # to UTF-8 instead of letting decode() raise TypeError.
        encoding = chardet.detect(ts)['encoding'] or 'utf-8'
        S = ts.decode(encoding, 'replace')
        # finditer scans the text once, left to right, without re-slicing
        # the page for every match.
        for g in _PKU_LINK_RE.finditer(S):
            TIME_C = time.time()
            link = g.group(0)[1:]  # drop the leading double quote
            if link not in url_list:
                url_list.append(link)
                print(link)
    except KeyboardInterrupt:
        # Raised in the main thread by the watchdog's interrupt_main().
        # NOTE(review): a real Ctrl-C is reported as a timeout here too.
        TIME_C = time.time()
        print("\nTime out\n")
        return
    except TimeoutError:
        print("\nTime out\n")
        return
    except urllib.error.URLError:
        return


def ergodic_list(start_point):
    """Crawl ``url_list`` level by level, starting at index ``start_point``.

    Each pass visits every URL from the current index up to the end of
    the list as it stood when the pass began, calling ``put_in`` on each
    (which may append new URLs).  If the list grew during the pass, the
    next pass starts at the first newly added entry; otherwise the crawl
    stops.
    """
    global url_list
    cursor = start_point
    while True:
        level_end = len(url_list)
        # Slicing takes a snapshot, so URLs appended by put_in are left
        # for the next pass.
        for link in url_list[cursor:]:
            print("\nfrom " + link + "\n" + "-" * 10)
            put_in(link)
        if len(url_list) <= level_end:
            break
        cursor = level_end


if __name__ == '__main__':
    # Shared crawl state: the last decoded page and the list of known URLs,
    # seeded with the starting page.
    S = ''
    url_list = ['http://www.pku.edu.cn/sitemap/bzdt.html']

    # Initialise the heartbeat before the watchdog thread starts reading it.
    TIME_C = time.time()
    _thread.start_new_thread(time_control, ())

    ergodic_list(0)

                
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值