Crawler Practice (1): Fetching All Blog Posts

1. Code

import requests
from lxml import html
from lxml import etree
from lxml.etree import ParserError
from time import sleep
import xlwt
import time
import re
# put the extracted data into a DataFrame
from pandas import DataFrame

class MyBlog:

    def __init__(self, blogNm):
        headers = {"User-Agent":"Python-urllib/2.6"}
        self.session = requests.Session()
        self.session.headers.update(headers)
        self.blogNm = blogNm
    
    def get_categorys(self):
        res = self.session.get(f"https://blog.csdn.net/{self.blogNm}")
        
        category_list = []
        #from lxml import html
        '''
        try:
            assert res.status_code == 200
            dom = html.document_fromstring(res.text)
        except AssertionError:
            raise AssertionError("get categorys failed.")
        except ParserError as e:
            raise ParserError(e)
        category_urls = dom.xpath('//*[@id="asideCategory"]//*[@class="clearfix"]/@href')
        category_names = dom.xpath('//*[@id="asideCategory"]//*[@class="title oneline"]/span/text()')
        '''
        '''
        #from lxml import etree
        root = etree.HTML(res.text)
        category_urls = root.xpath('//*[@id="asideCategory"]//*[@class="clearfix"]/@href')
        category_names = root.xpath('//*[@id="asideCategory"]//*[@class="title oneline"]/span/text()')

        for index, url in enumerate(category_urls, 0):
            temp_dict = {"categoryName":category_names[index].strip(), "url":url}
            category_list.append(temp_dict)
        print(category_names)
        print(category_list)
        # put the extracted data into a DataFrame for easier handling
        category_info = DataFrame([category_names, category_urls]).T
        # set the header
        category_info.columns = ["title", "url"]
        print(category_info)
        # show the first few rows
        print(category_info.head(10))
        # save locally
        category_info.to_csv("category_info.csv")
        '''
        # regex matching
        url_pattern = '<a class="clearfix" target="_blank" href="(.*?)"'
        category_urls = re.findall(url_pattern, res.text)
        print(category_urls)
        name_pattern = '<span class="text">(.*?)</span>'
        category_names = re.findall(name_pattern, res.text)
        print(category_names)
        # build the category list so get_all_blogs can consume it
        for index, url in enumerate(category_urls, 0):
            temp_dict = {"categoryName": category_names[index].strip(), "url": url}
            category_list.append(temp_dict)
        return category_list
    
    def get_blog_with_category(self, category_url):
        blog_list = []
        count = 0
        res = self.session.get(category_url)
        try:
            assert res.status_code == 200
            dom = html.document_fromstring(res.text)
        except AssertionError:
            raise AssertionError("get category detail failed.")
        except ParserError as e:
            raise ParserError(e)
        blog_urls = dom.xpath('//*[@class="column_article_list"]//a/@href')
        blog_names = dom.xpath('//*[@class="column_article_list"]//*[@class="column_article_title"]//h2/text()')
        blog_names_update = []
        for name in blog_names:
            name = name.strip()
            if name:
                blog_names_update.append(name)
        try:
            for index, url in enumerate(blog_urls, 0):
                temp_dict = {"blogName":blog_names_update[index], "url":url}
                blog_list.append(temp_dict)
                count +=1
        except IndexError:
            raise IndexError("Name and URL are inconsistent.")
        
        blogs_dict = {"count":count,
                      "blogs":blog_list}
        return blogs_dict
    
    def get_all_blogs(self):
        blogs = []
        count = 0
        categorys = self.get_categorys()
        for category in categorys:
            blog_dict = self.get_blog_with_category(category["url"])
            temp_dict = {"categoryName":category["categoryName"],
                         "elements":blog_dict
                         }
            blogs.append(temp_dict)
            count += blog_dict["count"]
            sleep(1)
        blogs_dict = {"blogs":blogs,
                      "count":count}
        return blogs_dict
    
def set_style(name, height, bold=False):
    style = xlwt.XFStyle()  # initialize the style

    font = xlwt.Font()  # create a font for the style
    font.name = name  # e.g. 'Times New Roman'
    font.bold = bold
    font.color_index = 4
    font.height = height

    # borders= xlwt.Borders()
    # borders.left= 6
    # borders.right= 6
    # borders.top= 6
    # borders.bottom= 6

    style.font = font
    # style.borders = borders

    return style

def write_excel(blogs):
    file_name = "blog_" + time.strftime("%Y%m%d%H%M%S") + ".xls"
    # create the workbook
    f = xlwt.Workbook()
    # create the sheet
    sheet1 = f.add_sheet(u'blog', cell_overwrite_ok=True)
    row0 = [u'分类', u'博客', u'url']  # header row: category, blog, url
    for i in range(0, len(row0)):
        sheet1.write(0, i, row0[i], set_style('Times New Roman', 220, True))
    
    # write the content rows
    row = 1
    for category in blogs["blogs"]:
        categoryName = category["categoryName"]
        for i in range(category["elements"]["count"]):
            blogName = category["elements"]["blogs"][i]["blogName"]
            blogUrl = category["elements"]["blogs"][i]["url"]
            info = [categoryName, blogName, blogUrl]
            for j in range(len(row0)):
                sheet1.write(row, j, info[j])
            row += 1
    # save the Excel file
    f.save(file_name)
    
if __name__ == "__main__":
    my = MyBlog("shitou987")
    my.get_categorys()
#     blogs_dict = my.get_all_blogs()
#     print(blogs_dict)
#     write_excel(blogs_dict)

# Scraping multiple pages:
# 1. Find the pagination pattern and construct the URLs (the page number is usually part of the URL).
# 2. Loop over the pages to scrape them all; see the sketch below.
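A minimal sketch of step 2, assuming CSDN column pages append the page number to the URL; the `_<page>.html` suffix and the empty-page stop condition are assumptions to verify against the real "next page" links:

def get_category_all_pages(my, category_url):
    # Hedged sketch: collect posts across all pages of one category.
    # The "_<page>.html" suffix is an assumed pagination format; check it
    # against the actual pagination links before relying on it.
    all_blogs = []
    page = 1
    while True:
        if page == 1:
            url = category_url
        else:
            url = category_url.replace(".html", f"_{page}.html")
        blogs_dict = my.get_blog_with_category(url)
        if blogs_dict["count"] == 0:  # an empty page means we ran past the last one
            break
        all_blogs.extend(blogs_dict["blogs"])
        page += 1
        sleep(1)  # be polite between requests
    return {"count": len(all_blogs), "blogs": all_blogs}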

2. Data structure

{
    'blogs': [{
            'categoryName': 'python',
            'elements': {
                'count': 39,
                'blogs': [{
                        'blogName': 'Python多线程&进程&协程',
                        'url': 'https://blog.csdn.net/shitou987/article/details/108460536'
                    }, {
                        'blogName': 'python jsonschema使用',
                        'url': 'https://blog.csdn.net/shitou987/article/details/107582352'
                    }, {
                        'blogName': '列表排序',
                        'url': 'https://blog.csdn.net/shitou987/article/details/107548825'
                    }, {
                        'blogName': '算法——递归',
                 ...
    ],
    'count': 88
}
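Accessing this structure is straightforward; a small usage sketch, assuming a MyBlog instance my as in the __main__ block:

blogs_dict = my.get_all_blogs()
print(blogs_dict["count"])             # total number of posts, e.g. 88
first_category = blogs_dict["blogs"][0]
print(first_category["categoryName"])  # e.g. 'python'
# the first three posts in that category
for blog in first_category["elements"]["blogs"][:3]:
    print(blog["blogName"], blog["url"])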

3. Writing to Excel

(screenshot: the generated Excel file)

4. DataFrame

(screenshot: the category DataFrame printed to the console)
Saving it as a CSV file:
(screenshot: the resulting category_info.csv)
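The DataFrame above comes from the commented-out block in get_categorys. The same idea as a standalone sketch; the names and URLs below are illustrative placeholders, not real category data:

from pandas import DataFrame

category_names = ["python", "linux"]  # illustrative values
category_urls = ["https://blog.csdn.net/shitou987/category_1.html",  # illustrative
                 "https://blog.csdn.net/shitou987/category_2.html"]
# transpose so each row becomes one (title, url) pair
category_info = DataFrame([category_names, category_urls]).T
category_info.columns = ["title", "url"]
print(category_info.head(10))              # show the first few rows
category_info.to_csv("category_info.csv")  # save locally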
