Python入门学习之：10分钟1500访问量

最新推荐文章于 2023-01-30 01:19:48 发布

叨陪鲤

最新推荐文章于 2023-01-30 01:19:48 发布

阅读量4.4k

点赞数

分类专栏：python爬虫；文章标签：CSDN访问量访问量刷刷刷

本文链接：https://blog.csdn.net/s2603898260/article/details/114647268

版权

python爬虫专栏收录该内容

11 篇文章 7 订阅

订阅专栏

看效果：

在这里插入图片描述

不扯没用的，直接上代码：

# author   : sunzd
# date     : 2019/9/01
# position : beijing

from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
import re
import time


def html_request(url):
    """Download the page at ``url`` and return its body decoded as UTF-8.

    A random browser User-Agent string (from ``fake_useragent``) is sent with
    the request.

    Args:
        url: Address to fetch. ``None`` is accepted and short-circuits.

    Returns:
        The response body as ``str``, or ``None`` when ``url`` is ``None`` or
        the request fails with ``urllib.error.URLError`` (``HTTPError`` is a
        subclass, so HTTP error statuses are covered too). The error code
        and/or reason are printed before returning ``None``.
    """
    if url is None:
        return None
    print("download html is :{0}".format(url))
    # NOTE: a URL containing non-ASCII (e.g. Chinese) characters has to be
    # percent-encoded by the caller; no encoding is performed here.

    # Pretend to be a browser. The header name must be spelled "User-Agent":
    # with the key "UserAgent" urllib does not recognise it as the agent
    # header and still adds its default "Python-urllib/x.y" one.
    headers = {'User-Agent': str(UserAgent().random)}
    req = request.Request(url, headers=headers)

    try:
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req) as response:
            html = response.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None
    return html


def html_parser(url, html):
    """Extract the article entries from a blog list page and request each one.

    Every ``article-item-box`` fragment found in ``html`` is parsed with
    BeautifulSoup; its title, type and link are printed and the link is
    fetched through ``html_request``. Finally the page is checked for the
    disabled "next page" pager element and the result is printed.

    Args:
        url: Address the page was downloaded from. Only checked for ``None``.
        html: Page source as ``str``.

    Returns:
        ``None`` when ``url`` or ``html`` is ``None``, otherwise ``0``.
    """
    if url is None or html is None:
        return None

    pattern_art = '<div class="article-item-box csdn-tracking-statistics" data(.+?)</div>'
    # Newlines are stripped first so each entry is matched on a single line.
    articles = re.compile(pattern_art, re.S).findall(html.replace('\n', ''))
    print(len(articles))
    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')
        title = soup.find('a', attrs={'target': '_blank'})
        if title is None:
            # find() returns None when the fragment has no matching link;
            # skip it instead of failing on attribute access.
            continue
        # The type label lives in a <span> inside the link; it may be absent.
        article_type = title.span.text if title.span is not None else ''
        print(
            "文章题目:{0}\n文章类型:{1}".format(
                title.text.replace(' ', '').replace("原", "").replace("转", ""),
                article_type))
        link = title.attrs.get('href')
        if link is None:
            # Nothing to visit without an href.
            continue
        print("文章链接:{0}".format(link))
        html_request(link)

    pattern_next = '<li class="js-page-next js-page-action ui-pager ui-pager-disabled">'
    last_page_markers = re.compile(pattern_next).findall(html)
    print("是否为最后一页:{0}----{1}".format(len(last_page_markers), last_page_markers))
    # NOTE(review): the original returned 0 whether or not the marker was
    # found; the value is kept so callers see no change. Confirm whether the
    # last page was meant to return something else.
    return 0


if __name__ == '__main__':
    # Walk list pages 1..6 of the given blog over and over, downloading each
    # page and every article linked from it. The loop never terminates on its
    # own; stop the script manually.
    # NOTE(review): there is no delay between requests; consider throttling.
    name = '你自己的名称'
    first_page = 1
    last_page = 6
    while True:
        for page in range(first_page, last_page + 1):
            url = "https://blog.csdn.net/" + name + "/article/list/" + str(page) + '?'
            html = html_request(url)
            # The return value is not used, so it is no longer bound to a
            # module-level name that shadowed the builtin next().
            html_parser(url, html)