爬虫实例2：Python学习文章爬取

最新推荐文章于 2024-04-06 02:15:54 发布

dianepure

最新推荐文章于 2024-04-06 02:15:54 发布

阅读量447

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/dianepure/article/details/88344202

版权

Python 专栏收录该内容

13 篇文章 0 订阅

订阅专栏

发现了一个由大佬讲解 Python 的教学网站：https://cuiqingcai.com/category/technique ，于是利用从上面学到的知识，把这个网站“技术杂谈”栏目下 Python 分类的文章列表爬取了一遍。

主要使用的模块：

requests、BeautifulSoup（bs4）、openpyxl.Workbook、time

源码：

from openpyxl import Workbook
from bs4 import BeautifulSoup
import requests
import time
# Wall-clock timestamp captured once, when this module is first executed.
time1 = time.time()


def _parse_article(article):
    """Extract one listing row from an ``<article>`` tag.

    Returns a 6-tuple ``(title, url, user, clock, eye, comments)``, or
    ``None`` when the article has no ``<h2><a>`` title link.  The last four
    values are the texts of the first four ``span.muted`` elements inside
    ``p.auth-span``, taken by position; missing ones are returned as ``''``.
    """
    link = article.h2.a if article.h2 is not None else None
    if link is None:
        return None

    auth_block = article.find('p', class_="auth-span")
    spans = auth_block.find_all('span', class_="muted") if auth_block is not None else []

    # Positional slicing replaces the original ``auth.index(span)`` lookup,
    # which was quadratic and returned the first *equal* span, so two spans
    # with identical markup were both filed under the same column.
    meta = [span.get_text() for span in spans[:4]]
    # Always yield exactly four metadata values so the parallel lists built
    # by the caller stay the same length.
    meta.extend([''] * (4 - len(meta)))

    return (link.get('title', ''), link.get('href', ''), *meta)


def down_url(first_page=1, last_page=19):
    """Scrape the article listing pages and collect per-article fields.

    Fetches ``https://cuiqingcai.com/category/technique/python/page/<n>`` for
    every ``n`` from ``first_page`` to ``last_page`` (inclusive; the defaults
    reproduce the original pages 1..19) and reads every ``<article>`` inside
    ``div.content``.

    A page that cannot be fetched (network error, timeout, HTTP error status)
    or that has no ``div.content`` container is reported/skipped instead of
    aborting the whole crawl.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).

    Returns:
        A 6-tuple of equally long lists
        ``(head, head_url, head_user, head_clock, head_eye, head_comments)``:
        titles, links, and the four ``span.muted`` texts of each article
        (presumably author, date, view count and comment count, judging by
        the variable names -- confirm against the live markup).
    """
    # Timed locally so the function does not depend on a module-level global.
    started = time.time()

    # Loop-invariant, so build it once.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/47.0.2526.80 Safari/537.36 '
    }

    head = []
    head_url = []
    head_user = []
    head_clock = []
    head_eye = []
    head_comments = []

    for i in range(first_page, last_page + 1):
        print('正在抓取第' + str(i) + "页.................................")
        url = 'https://cuiqingcai.com/category/technique/python/page/' + str(i)

        try:
            # Without a timeout a stalled connection would hang forever.
            response = requests.get(url, headers=header, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print('第' + str(i) + '页抓取失败，已跳过: %s' % err)
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        div = soup.find('div', class_="content")
        if div is None:
            # Layout changed or an error page was served: nothing to parse.
            continue

        for article in div.find_all('article'):
            row = _parse_article(article)
            if row is None:
                continue
            # Append all six fields together so every list gets exactly one
            # entry per article.
            title, link, user, clock, eye, comments = row
            head.append(title)
            head_url.append(link)
            head_user.append(user)
            head_clock.append(clock)
            head_eye.append(eye)
            head_comments.append(comments)

    print('总共耗时 %s s' % round((time.time() - started), 2))
    return head, head_url, head_user, head_clock, head_eye, head_comments


def workbook(head, head_url, head_user, head_clock, head_eye, head_comments,
             filename='Python网址.xlsx'):
    """Write the scraped columns to an .xlsx file, one article per row.

    Row 1 holds the column captions ``name, url, user, clock, eye, comments``;
    each following row holds the values found at the same index in the six
    input lists.  The number of data rows equals ``len(head)``.

    Args:
        head: Article titles (column A); drives the number of rows.
        head_url: Article links (column B).
        head_user: Values for the ``user`` column (C).
        head_clock: Values for the ``clock`` column (D).
        head_eye: Values for the ``eye`` column (E).
        head_comments: Values for the ``comments`` column (F).
        filename: Path of the workbook to create; defaults to the original
            hard-coded ``'Python网址.xlsx'``.
    """
    wb = Workbook()
    ws1 = wb.active

    # On a fresh sheet the first append() lands on row 1, the next on row 2,
    # and so on -- the same layout as addressing 'A1', 'A2', ... by hand.
    ws1.append(['name', 'url', 'user', 'clock', 'eye', 'comments'])

    columns = (head, head_url, head_user, head_clock, head_eye, head_comments)
    for row in range(len(head)):
        # A column shorter than `head` yields an empty cell instead of an
        # IndexError that would discard the whole scrape before saving.
        ws1.append([col[row] if row < len(col) else None for col in columns])

    wb.save(filename=filename)


if __name__ == '__main__':
    # Guarded so that importing this module does not start a network crawl
    # and write a file as a side effect.  down_url() returns the six column
    # lists in exactly the order workbook() expects them.
    workbook(*down_url())