爬取博客首页，并将所有博客文章写入文件

最新推荐文章于 2023-01-10 11:45:03 发布

Hello_Jandy

最新推荐文章于 2023-01-10 11:45:03 发布

阅读量1.3k

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/weixin_42141853/article/details/80787729

版权

爬虫专栏收录该内容

17 篇文章 0 订阅

订阅专栏

import urllib.request
import re

import time


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8 text.

    As a side effect, an opener carrying a fixed proxy mapping and a custom
    ``User-Agent`` header is built and installed as the process-wide default
    for ``urllib.request``, so later ``urlopen``/``urlretrieve`` calls in the
    same process use it as well.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``str``; bytes that are not valid UTF-8 are
        dropped (``errors='ignore'``).

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    headers = ('User-Agent', "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11")
    # Proxy settings: a dict mapping URL scheme -> proxy address.
    # NOTE(review): only the 'http' scheme is mapped, so https:// URLs
    # presumably bypass this proxy -- confirm that this is intended.
    proxy = urllib.request.ProxyHandler({'http': '118.190.95.26:9001'})
    # Plain urlopen() does not support proxies, cookies and other advanced
    # HTTP/HTTPS features out of the box, so a custom opener is built.
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    opener.addheaders = [headers]  # must be a list of (name, value) tuples
    urllib.request.install_opener(opener)
    # Read inside a ``with`` block so the underlying connection/file handle
    # is closed deterministically instead of being left to the GC.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode('utf-8', 'ignore')

def writeDown(urlLists,fileurl):
    """Download every URL in ``urlLists`` to ``fileurl + str(i) + '.html'``.

    ``i`` is the position of the URL inside ``urlLists``. The entry at
    index 0 is skipped, exactly as before.
    NOTE(review): confirm that skipping the first URL is intentional.

    The directory part of ``fileurl`` is created when it does not exist yet;
    previously a missing directory made ``urlretrieve`` raise
    ``FileNotFoundError``, which is not a ``URLError`` and therefore aborted
    the whole run.

    Args:
        urlLists: Iterable of URL strings.
        fileurl: Path prefix of the output files (e.g. a directory path
            ending in ``/``).

    A ``urllib.error.URLError`` raised for a single URL is reported on
    stdout (its ``code`` and/or ``reason``) and the loop continues with the
    next URL. Other exception types propagate to the caller.
    """
    import os  # local import: ``os`` is not imported at module level

    target_dir = os.path.dirname(fileurl)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    for i, url in enumerate(urlLists):
        if i == 0:
            # Preserve the original behaviour of starting at index 1.
            continue
        try:
            urllib.request.urlretrieve(url, fileurl + str(i) + '.html')
        except urllib.error.URLError as e:
            # ``code`` is only present on HTTPError; ``reason`` on URLError.
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
                
# Script body: fetch the blog home page, collect every link that points to
# blog.csdn.net, and save the linked pages under ``fileurl``.
fileurl='C:/Users/Administrator/Desktop/pretice/blogs/'  # output path prefix
blogurl='https://blog.csdn.net/'  # page whose links are harvested
data=getHtml(blogurl)
# Non-greedy capture of the target of each href that starts with the blog host.
pat='href="(https://blog.csdn.net/.*?)"'
urlLists=re.findall(pat,data)
print(urlLists)
print(len(urlLists))
# Debug sample of one harvested link. Guarded so that a page yielding fewer
# than seven matches no longer raises IndexError before anything is saved.
if len(urlLists) > 6:
    print(urlLists[6])
writeDown(urlLists,fileurl)