import os
import re
import time
import urllib.error
import urllib.request
def getHtml(url):
    """Fetch ``url`` and return the response body as text.

    Builds an opener carrying a fixed Opera ``User-Agent`` header and an
    HTTP proxy handler, installs it as the process-wide default opener of
    ``urllib.request`` (so later ``urllib.request.urlopen`` calls use it
    too), and downloads ``url`` through it.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8. Byte sequences that are not
        valid UTF-8 are dropped instead of raising.

    Raises:
        Any exception raised by the opener while fetching ``url`` (for
        example ``urllib.error.URLError``) propagates to the caller.
    """
    user_agent = ('User-Agent', "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11")
    # Proxy settings: a dict mapping scheme -> proxy address. Only the
    # 'http' scheme is mapped here.
    proxy = urllib.request.ProxyHandler({'http': '118.190.95.26:9001'})
    # Plain urlopen() has no knobs for proxies, cookies and similar
    # HTTP/HTTPS features, so a custom opener is built instead.
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [user_agent]  # a list of (name, value) tuples
    urllib.request.install_opener(opener)
    # Close the response deterministically instead of leaving the
    # underlying connection to the garbage collector.
    with opener.open(url) as response:
        return response.read().decode('utf-8', 'ignore')
def writeDown(urlLists, fileurl):
    """Download the URLs in ``urlLists`` to numbered ``.html`` files.

    The entry at index ``i`` is saved as ``fileurl + str(i) + '.html'``.
    The directory part of ``fileurl`` is created when it does not exist.

    Args:
        urlLists: Sequence of URL strings.
        fileurl: Path prefix for the output files, e.g. a directory path
            ending in a separator.

    Returns:
        None. A failed download is reported on stdout (HTTP status code
        and/or reason) and the remaining URLs are still processed.
    """
    # Without the destination directory the very first urlretrieve() call
    # fails with FileNotFoundError, which the handler below does not
    # catch, so make sure it exists before starting.
    target_dir = os.path.dirname(fileurl)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): the entry at index 0 is never downloaded; confirm the
    # first link is really meant to be left out.
    for index, url in enumerate(urlLists[1:], start=1):
        try:
            urllib.request.urlretrieve(url, fileurl + str(index) + '.html')
        except urllib.error.URLError as e:
            # Best effort: print whatever detail the error carries and
            # move on to the next URL.
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
# Script body: fetch the blog home page, collect every link that points
# back into the blog site, and save each linked page to disk.

# Path prefix for the saved pages; the list index and '.html' are appended.
fileurl = 'C:/Users/Administrator/Desktop/pretice/blogs/'
blogurl = 'https://blog.csdn.net/'
data = getHtml(blogurl)

# Capture the target of every href that starts with the blog site address.
pat = 'href="(https://blog.csdn.net/.*?)"'
urlLists = re.findall(pat, data)
print(urlLists)
print(len(urlLists))

# Diagnostic sample of one entry. Guarded so that a page with fewer than
# seven matching links does not raise IndexError before anything is saved.
if len(urlLists) > 6:
    print(urlLists[6])

writeDown(urlLists, fileurl)
# Crawl the blog home page and write every blog article to a file.
# (Source page note: latest recommended article published 2023-01-10 11:45:03.)