Python Crawler
-- A Python script that crawls the source code and other data from the mirror site provided by the Fuchsia Chinese community; for reference only --
import os
import urllib.request
import urllib.error
from collections import deque
from bs4 import BeautifulSoup  # the html5lib package must also be installed for the parser used below

queue = deque()
url = "https://mirrors.hexang.com/fuchsia/"
cur_path = os.path.abspath('.') + "\\fuchsia"  # Windows-style path: everything is saved under .\fuchsia
types = []  # every distinct Content-Type seen so far

def add_type(content_type):
    # record each Content-Type once, for later inspection
    if content_type not in types:
        types.append(content_type)
    return types

queue.append(url)
while queue:
    url_ = queue.popleft()
    print("url open : ", url_)
    try:
        op = urllib.request.urlopen(url_)
    except urllib.error.HTTPError as e:
        print(e.code, e.reason)
        continue
    except urllib.error.URLError as e:
        print(e.reason)
        continue
    except Exception as e:
        print("other exception : ", e)
        continue
    # anything that did not return 200 OK is skipped
    if op.status != 200:
        continue
    data = op.read()
    soup = BeautifulSoup(data, "html5lib")
    for a in soup.find_all('a'):
        link = a.get('href')
        # skip the parent-directory link so the crawl never walks back up
        if link and link != "../":
            queue.append(url_ + link)
    # derive the local sub-path from the URL segments after "fuchsia"
    split_str = url_.split('/')
    index = split_str.index("fuchsia")
    split_str = split_str[index + 1:]
    sub_modules = [i for i in split_str if i != '']
    print("sub_modules : ", sub_modules)
    sub_path = ''
    for i in sub_modules:
        sub_path += i + "\\"
    print("sub_path : ", sub_path)
    tp = op.getheader("Content-Type")
    print(tp)
    types = add_type(tp)
    print("types : ", types)
    # different Content-Types can be handled differently here:
    # an HTML response is a directory listing, anything else is a file to save
    if tp and "html" in tp:
        path = cur_path + "\\" + sub_path
        print("mkdir : ", path)
        if not os.path.exists(path):
            os.makedirs(path)
    else:
        sub_path = sub_path[:-1]  # drop the trailing backslash for file paths
        abs_path = cur_path + "\\" + sub_path
        print("save file ", abs_path)
        with open(abs_path, "wb") as f:
            f.write(data)
The crawled data comes down incomplete, so it cannot be compiled.
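
One possible cause of incomplete data is silently truncated downloads. Below is a minimal sketch of a post-download check, assuming the mirror answers HEAD requests and reports a Content-Length header; the verify_download helper is hypothetical, not part of the original script.

import os
import urllib.request

def verify_download(url, local_path):
    # Hypothetical helper: compare the server-reported Content-Length
    # with the size of the file actually saved on disk.
    req = urllib.request.Request(url, method="HEAD")
    with urllib.request.urlopen(req) as resp:
        expected = resp.getheader("Content-Length")
    if expected is None:
        return None  # server did not report a size; cannot verify
    return os.path.getsize(local_path) == int(expected)

Files that fail this check could be re-appended to the queue for another download attempt.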