Python爬虫——实战：爬取博客园指定信息

最新推荐文章于 2024-08-06 17:47:02 发布

企鹅家的北极熊

最新推荐文章于 2024-08-06 17:47:02 发布

阅读量964

点赞数

分类专栏： Python高级——爬虫

本文链接：https://blog.csdn.net/sinat_42247418/article/details/121599572

版权

开发语言 python 爬虫

Python高级——爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Python爬虫——实战：爬取博客园指定信息

用正则表达式提取数据

用正则表达式提取数据

# 课程内容：爬虫实战博客园
# 开发时间： 16:25
import requests
import re
def get_one_page(url, page=None):
    """Download one page and return its HTML text.

    Args:
        url: Full address of the page to fetch.
        page: Unused. Kept (now optional) so that both one-argument and
            two-argument calls work.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise an empty string.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }
    # Fetch the requested URL (the previous code always fetched a fixed,
    # unrelated address) and bound the wait so a stalled server cannot
    # hang the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return ''

def parse_one_page(html_txt):
    """Extract the title and counters of the last blog entry on a page.

    The page is split into entries with the pattern
    ``<div class="day">...编辑``. Within each entry the title is taken from
    the ``<span>`` element (only when exactly one is present) and the
    counters from the ``(digits)`` groups.

    Args:
        html_txt: HTML text of one listing page.

    Returns:
        A tuple ``(title, read_count, com_count, digg_count)`` of strings
        for the last entry found. Every field is ``''`` when the page has
        no entries; the three counters are ``''`` when the entry does not
        contain exactly three ``(digits)`` groups.
    """
    # Newlines are removed so that '.' in the patterns can span what were
    # separate lines in the original markup.
    html_str = html_txt.replace('\n', '')

    # Defaults make the return well-defined when nothing matches.
    title = ''
    com_count = read_count = digg_count = ''

    for blog in re.findall(r'<div class="day">(.*?)编辑', html_str):
        spans = re.findall(r'<span>(.*?)</span>', blog)
        title = spans[0] if len(spans) == 1 else ''

        counts = re.findall(r'\((\d+)\)', blog)
        if len(counts) == 3:
            # NOTE(review): the counters are assigned in the order
            # comment, read, digg — confirm this matches the page layout.
            com_count, read_count, digg_count = counts
        else:
            # An unexpected number of counters: leave them blank instead
            # of failing on the unpack.
            com_count = read_count = digg_count = ''
    return (title, read_count, com_count, digg_count)

if __name__ == '__main__':
    base_url = 'https://www.cnblogs.com/pinard/default.html?page='
    # Walk the paginated listing, pages 1 through 14.
    for page in range(1, 15):
        url = base_url + str(page)
        # 1. Fetch the content of one page.
        html_txt = get_one_page(url, page)
        # 2. Parse the content of that page.
        cont = parse_one_page(html_txt)
        # 3. Save the extracted data.
        # NOTE(review): save2txt is not defined in the visible part of this
        # file — confirm it exists (or add it) before running the script.
        save2txt(cont)

    # 4. Repeat steps 1 and 2.
    # 5. Save the extracted data.