Step 1: fetch a single page:
import urllib.request

url = "http://www.badtom.cn"
# urlopen() returns an HTTPResponse; read() gives the raw bytes of the page
data = urllib.request.urlopen(url).read()
data = data.decode('UTF-8')   # assume the page is served as UTF-8
print(data)
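One caveat: decode('UTF-8') raises UnicodeDecodeError when the server responds with a different encoding (GBK is common on Chinese sites). A slightly more defensive sketch, which prefers the charset declared in the Content-Type header and falls back to UTF-8 (the 5-second timeout is an arbitrary choice):

import urllib.request

url = "http://www.badtom.cn"
with urllib.request.urlopen(url, timeout=5) as urlop:
    # use the charset from the Content-Type header if the server declares one
    charset = urlop.headers.get_content_charset() or 'utf-8'
    data = urlop.read().decode(charset, errors='replace')
print(data)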
Step 2: a straightforward implementation of the pseudocode from the previous (single-machine) post:
from collections import deque
import re
import urllib.request

queue = deque()      # BFS frontier: URLs waiting to be fetched
visited = set()      # URLs already enqueued, so each page is fetched once
init_url = "http://www.badtom.cn"
queue.append(init_url)
visited.add(init_url)
count = 0
linkre = re.compile('href="(.+?)"')   # crude link extraction; compile once

while queue:
    url = queue.popleft()
    print('Fetched ' + str(count) + ' pages, now fetching --> ' + url)
    count += 1
    try:
        urlop = urllib.request.urlopen(url, timeout=2)
        data = urlop.read().decode('utf-8')
        # print(data)
    except Exception:
        continue   # skip pages that time out or fail to decode
    linkdata = linkre.findall(data)
    for next_url in linkdata:
        # follow only absolute http(s) links we have not seen before
        if 'http' in next_url and next_url not in visited:
            queue.append(next_url)
            visited.add(next_url)
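Note that the 'http' in next_url test silently drops relative links such as href="/about", which most sites use for internal navigation. If those should be crawled too, urllib.parse.urljoin can resolve them against the current page; a sketch of an alternative loop body (not part of the original code):

from urllib.parse import urljoin

for next_url in linkdata:
    # turn relative links ("/about", "page.html") into absolute URLs
    absolute = urljoin(url, next_url)
    if absolute.startswith('http') and absolute not in visited:
        queue.append(absolute)
        visited.add(absolute)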
Step 3: masquerade as a browser by sending request headers, and save each crawled page to disk:
from collections import deque
import re
import urllib.request

# Save a crawled page to disk
def saveToFile(filePath, data):
    with open(filePath, 'w', encoding='utf-8') as fileop:
        fileop.write(data)

queue = deque()
visited = set()
init_url = "http://www.badtom.cn"
queue.append(init_url)
visited.add(init_url)

# Browser-like request headers; note this User-Agent string actually
# identifies as Internet Explorer 11 on Windows 8.1, not Firefox
headinfo = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

filePath = 'E:/spider/'   # output directory; must exist before the crawl
count = 0
linkre = re.compile('href="(.+?)"')

while queue:
    url = queue.popleft()
    print('Fetched ' + str(count) + ' pages, now fetching --> ' + url)
    count += 1
    try:
        req = urllib.request.Request(url, headers=headinfo)
        urlop = urllib.request.urlopen(req, timeout=2)
        data = urlop.read().decode('utf-8')
        saveToFile(filePath + str(count) + '.html', data)
    except Exception:
        continue   # skip pages that time out, fail to decode, or fail to save
    linkdata = linkre.findall(data)
    for next_url in linkdata:
        if 'http' in next_url and 'github' not in next_url and next_url not in visited:
            queue.append(next_url)
            visited.add(next_url)
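Finally, a crawler like this should be polite to the sites it visits: pause between requests and honor robots.txt. A minimal sketch using only the standard library (the one-second delay and the '*' user-agent are arbitrary choices; headinfo is the header dict from step 3):

import time
import urllib.request
import urllib.robotparser

# fetch and parse the site's robots.txt once, up front
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.badtom.cn/robots.txt')
rp.read()

def polite_fetch(url):
    # fetch url only if robots.txt allows it, at most one request per second
    if not rp.can_fetch('*', url):
        return None
    time.sleep(1)
    req = urllib.request.Request(url, headers=headinfo)
    return urllib.request.urlopen(req, timeout=2).read()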