想用 Python 3 内置的 urllib 库爬取一些新闻、文章,结果运行时报错。
代码如下:
import urllib.request
import re
# Scrape the CSDN front page, collect every blog-article link found in its
# HTML, and save each linked article as a numbered local .html file.
url = "https://www.csdn.net/"
header = ("User-Agent", "Mozilla/5.0")
opr = urllib.request.build_opener()
opr.addheaders = [header]
# urlretrieve() goes through the globally installed opener, not `opr`.
# Without this call the article downloads below would be sent with the
# default "Python-urllib" User-Agent instead of the header configured above.
urllib.request.install_opener(opr)

# Fetch the front page; the context manager closes the HTTP response.
with opr.open(url) as response:
    data = response.read()
data = data.decode("utf-8")

# Capture the href of every anchor that points at a CSDN blog article.
pat = '<a href="(https://blog.csdn.net/.*?/article/details/.*?)"'
allurl = re.compile(pat).findall(data)
# print(allurl)  # debug: uncomment to inspect the extracted links

for i, thisurl in enumerate(allurl):
    try:
        print("这是爬取的第{}个".format(i))
        file = "//Users//liuyuan//Desktop//1234//" + str(i) + ".html"
        urllib.request.urlretrieve(thisurl, file)
        print("----成功----")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries `code`; a plain URLError
        # only has `reason`, so report whichever attributes are present and
        # continue with the next link.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
运行上述