静态网页爬虫实战（一）

最新推荐文章于 2023-12-22 02:49:36 发布

荏苒相忆

最新推荐文章于 2023-12-22 02:49:36 发布

阅读量325

点赞数

分类专栏： python 文章标签：爬虫

本文链接：https://blog.csdn.net/sinat_36151966/article/details/88529557

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

以抽屉网为例，爬取指定用户（本例中为 cocolary）的评论列表
在这里插入图片描述

import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup


# Crawl several pages of one user's comment list by looping over the page
# number, printing every comment found and collecting the fields in lists.

# Request headers do not change between pages, so build them once.
# A browser-like User-Agent is sent with every request.
headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/51.0.2704.63 Safari/537.36'}

# Result lists are created once, outside the page loop, so that they
# accumulate the comments of ALL pages instead of being reset on each page.
news_time = []
news_com = []
news_source = []
news_kind = []
news_ding = []
news_cai = []
news_title = []
news_state = []

for page in range(3):  # three pages as an example
    # Comparing the per-page comment URLs shows that only the trailing number changes.
    # NOTE(review): range(3) requests pages 0, 1 and 2 - confirm whether the
    # site numbers its pages from 0 or from 1.
    url = 'https://dig.chouti.com/user/cocolary/comments/' + str(page)

    request = urllib.request.Request(url, headers=headers2)  # attach the custom headers
    # The context manager closes the response even if reading fails; the
    # timeout keeps a stalled connection from hanging the script forever.
    with urllib.request.urlopen(request, timeout=10) as answer:
        html_text = answer.read()

    soup = BeautifulSoup(html_text.decode('utf-8'), 'html.parser')  # parse the page

    # Find the div whose class attribute is "content-list".
    news_list = soup.find('div', {'class': 'content-list'})
    if news_list is None:
        # The page has no comment container (layout change, empty page, ...):
        # skip it instead of crashing on None.find_all.
        continue
    # Collect every div below content-list (find_all searches recursively).
    news = news_list.find_all('div')

    # Walk over the candidate divs.
    for i in news:
        try:
            # A div that is not a complete comment entry lacks at least one of
            # these children; find() then returns None and the chained call
            # raises AttributeError, which is how such divs are filtered out.
            comment_time = i.find('div', {'class': 'comment-time'}).get_text().strip()  # comment time
            com = i.find('span', {'class': 'text-comment-con'}).get_text().strip()  # comment text
            source = i.find('span', {'class': 'content-source'}).get_text().strip()  # source
            kind = i.find('span', {'class': 'content-kind'}).get_text().strip()  # source section
            ding_num = i.find('span', {'class': 'ding-num'}).get_text().strip()  # up-vote count
            cai_num = i.find('span', {'class': 'cai-num'}).get_text().strip()  # down-vote count
            title = i.find('div', {'class': 'comment-title'}).find('a')  # link to the commented news item
            state = i.find('div', {'class': 'comment-state'}).find('a')  # link to the replies
        except AttributeError:
            continue

        # Store the scraped fields.
        news_time.append(comment_time)
        news_com.append(com)
        news_source.append(source)
        news_kind.append(kind)
        news_ding.append(ding_num)
        news_cai.append(cai_num)
        news_title.append(title)
        news_state.append(state)

        # Print the result.
        print('评论：', com)
        print('时间：', comment_time)
        print('顶：', ding_num, '踩：', cai_num)
        print('系统来源：', source, '区域来源：', kind)
        print('评论新闻href：', title)
        print('查看回复：', state)
        print()