import re
from collections import deque

import crawler2

queue = deque()
visited = set()
url = 'http://www.baidu.com'
oper = crawler2.makeMyOpener()
queue.append(url)
cnt = 1
linkre = re.compile(r'href="(.+?)"')  # compile the link pattern once, outside the loop

while queue:
    url = queue.popleft()
    visited.add(url)  # mark as visited
    print('Fetching link #', cnt, ', current URL:', url)
    try:
        uop = oper.open(url, timeout=5)
    except Exception:
        continue
    # getheader() can return None, so guard before the substring test
    content_type = uop.getheader('Content-Type')
    if content_type is None or 'html' not in content_type:
        continue
    try:
        data = uop.read()
        crawler2.saveFile(data)
        data = data.decode(encoding='UTF-8')
        print('Saved page #', cnt)
    except Exception:
        continue
    cnt += 1
    for x in linkre.findall(data):
        # keep only absolute, non-baidu links that have not been visited yet
        if 'http' in x and '.com' in x and 'baidu' not in x and x not in visited:
            queue.append(x)
            print('Queued', x)
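One limitation of the extraction step: the 'http' in x filter keeps only absolute URLs, so relative links such as /path/page.html are silently dropped. Below is a minimal sketch of how they could be resolved against the current page with urllib.parse.urljoin (resolve_links is a helper name introduced here for illustration, not part of the original script):

import urllib.parse

def resolve_links(base_url, hrefs):
    # turn every href (absolute or relative) into an absolute URL
    # relative to the page it was found on
    out = []
    for href in hrefs:
        absolute = urllib.parse.urljoin(base_url, href)
        # mirror the main loop's filter: keep only web links
        if absolute.startswith(('http://', 'https://')):
            out.append(absolute)
    return out

# resolve_links('http://www.example.com/a/', ['/b.html', 'c.html'])
# -> ['http://www.example.com/b.html', 'http://www.example.com/a/c.html']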
Below is the content of the crawler2.py module; since it contains several functions, it is kept as a separate module:
import urllib.request
import http.cookiejar

# head: dict of default request headers
def makeMyOpener(head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.163.400 QQBrowser/9.3.7175.400'
}):
    # build an opener that keeps cookies across requests
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener

def saveFile(data):
    # append the raw page bytes to a single output file
    save_path = 'D:\\output.out'
    with open(save_path, 'ab') as f_obj:
        f_obj.write(data)
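Since saveFile appends every page to the same D:\output.out, the file grows without bound and individual pages are hard to tell apart afterwards. A possible variant (saveFile2 and the D:\pages directory are illustrative names, not part of the original module) writes each page to its own numbered file instead:

import os

def saveFile2(data, cnt, out_dir='D:\\pages'):
    # write each crawled page to its own numbered file
    os.makedirs(out_dir, exist_ok=True)
    save_path = os.path.join(out_dir, 'page_%05d.html' % cnt)
    with open(save_path, 'wb') as f_obj:
        f_obj.write(data)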
This runs under Python 3. Unless you stop it, the program keeps working indefinitely, continuously appending every crawled page to the output file in binary form.
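If you only want a bounded run instead, one simple option is to cap the page counter in the loop condition; a minimal skeleton (MAX_PAGES is an illustrative limit introduced here, and the loop body is elided):

from collections import deque

MAX_PAGES = 50  # illustrative cap on how many pages to save

queue = deque(['http://www.example.com'])
visited = set()
cnt = 1
while queue and cnt <= MAX_PAGES:
    url = queue.popleft()
    visited.add(url)
    # ... fetch, save, and enqueue new links exactly as in the script above ...
    cnt += 1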
Comments and discussion are welcome!