# coding: UTF-8
import urllib
import re
import sys
import os
import threading
from time import ctime
#sys.stderr = None
#获取每页URL,返回当前页URL 列表
def getUrl(url):
page = urllib.urlopen(url)
srcHtml = page.read()
reg = '([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})/.*?'
aListUrl = re.findall(reg,srcHtml)
for index in range(len(aListUrl)):
reg = '[|]'
ok = re.sub(reg,'',aListUrl[index])
if ok:
aListUrl[index] = ok
return aListUrl
#拼装所有请求页URL地址 "Codier's Blog" "30"
def getPage(search,num,threadNum):
pageList = []
for index in range(0,num):
strBuff = "http://www.baidu.com/s?wd="+search+"&pn="
nowPage = index*10
strBuff = strBuff + str(nowPage)
pageList.append(strBuff)
return pageList,threadNum
#封装多线程对象列表
def creMultiThread(allPageList,threadNum=2):
if len(allPageList)
getPageListUrl(allPageList)
else:
threads = []
nextpage = ceil(len(allPageList)/(10*threadNum))
for index in range(0,threadNum):
threads.append(threading.Thread(target=getPageListUrl,args=(pageList[nextpage*index*10:nextpage*(index+1)*10],)))
for thd in threads:
thd.setDaemon(True)
thd.start()
for thd in threads:
thd.join()
resultList = []
def getPageListUrl(pageList):
global resultList
for index in pageList:
for index2 in getUrl(index):
print index2
resultList.append(index2)
#把每页采集到的URL写入文件
def putFile(allUrl):
if os.path.exists("url.txt"):
print u"己删除历史记录,将重新生成采集url...."
os.remove("url.txt")
for index in allUrl:
file = open("url.txt","a+")
file.write(index+"\n")
file.close()
#多线程跑起来
if __name__ == '__main__':
if len(sys.argv) == 4:
list = getPage(str(sys.argv[1]),int(sys.argv[2]),int(sys.argv[3]))
st = ctime()
creMultiThread(list[0],list[1])
result = set(resultList)
print u'''
-----------------------------------------------------
多线程百度URL采集器 python:2.7.10
-----------------------------------------------------
^v^ example:geturl.pyc "关键字" "页数" "线程数" ^v^
^v^ team:www.secbox.cn ^v^
codier_qq:510623849
-----------------------------------------------------
'''
putFile(result)
print u"开始时间:"+st
print u"总共采集"+str(len(resultList))+u"条url........."
print u"去除重复剩余"+str(len(result))+u"条url........."
print u"采集结果已经自动保存当前目录 url.txt 文件中..... ^V^"
print u"结束时间:"+ctime()
elif len(sys.argv)
print u'''
-----------------------------------------------------
多线程百度URL采集器 python:2.7.10
-----------------------------------------------------
^v^ example:geturl.pyc "关键字" "页数" "线程数" ^v^
^v^ team:www.secbox.cn ^v^
codier_qq:510623849
-----------------------------------------------------
'''