【有待理解和修改的代码】协程爬取学校网页

小小岛风

于 2023-04-16 15:29:58 发布

阅读量137

点赞数

文章标签： python 爬虫

本文链接：https://blog.csdn.net/weixin_58973212/article/details/130183148

版权

该代码示例展示了如何使用Python的requests库进行网络请求，结合lxml和BeautifulSoup解析HTML，利用gevent进行并发处理，从指定网页抓取用户信息、图片链接、内容、点赞数和评论数，并将数据保存到本地文件中。

摘要由CSDN通过智能技术生成

原参考代码

import requests
from queue import Queue
from lxml import etree
import gevent
import time

class Spider(object):
    """Crawl the paginated list pages and dump every scraped item to a file.

    One gevent greenlet is spawned per page. Each greenlet downloads its page,
    parses it with lxml and pushes the extracted records onto ``data_queue``.
    When all greenlets have finished, the queue is drained into ``duanzi.txt``.

    NOTE(review): this listing shows no gevent monkey patching. Unless it is
    applied elsewhere, ``requests.get`` and ``time.sleep`` block the whole
    process, so the pages are fetched one after another — confirm.
    """

    def __init__(self):
        # Headers must be a mapping of header name -> value. The previous
        # value was a set containing the single string "User-Agent: ...",
        # which is not a usable ``headers`` argument for requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
        }
        # URL of the first list page; other pages differ only in ``Curpage``.
        self.base_url = "http://www.glutnn.cn/list.aspx?s=1&ClassID=110101&Curpage=1"
        # Queue collecting the scraped records until they are written out.
        self.data_queue = Queue()
        # Number of records scraped so far.
        self.count = 0

    def _build_page_url(self, page):
        """Return ``base_url`` with its ``Curpage`` query parameter set to *page*."""
        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

        parts = urlsplit(self.base_url)
        query = dict(parse_qsl(parts.query, keep_blank_values=True))
        query["Curpage"] = str(page)
        return urlunsplit(parts._replace(query=urlencode(query)))

    def send_request(self, url):
        """Download *url* and hand the raw response body to ``parse_page``."""
        print("[INFO]: 正在抓取" + url)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html = requests.get(url, headers=self.headers, timeout=10).content
        # Pause one second after every request to go easy on the server.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract one record per matching node and queue it.

        NOTE(review): the XPath expressions select elements whose ``id``
        contains "qiushi_tag"; confirm that the target pages really use this
        markup, otherwise no record is produced.
        """
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath('//div[contains(@id,"qiushi_tag")]')
        for node in node_list:
            # User name: first <h2> below the node's first child <div>.
            username = node.xpath('./div')[0].xpath(".//h2")[0].text
            # Image links: every src attribute below the "thumb" <div>.
            image = node.xpath('.//div[@class="thumb"]//@src')
            # Text content of the first <span> in the "content" <div>.
            content = node.xpath('.//div[@class="content"]/span')[0].text
            # The first <i> holds the like count, the second the comment count.
            counters = node.xpath('.//i')
            zan = counters[0].text
            comments = counters[1].text
            items = {
                "username": username,
                "content": content,
                "image": image,
                "zan": zan,
                "comments": comments
            }
            self.count += 1
            self.data_queue.put(items)

    def start_work(self):
        """Crawl pages 1-13, write all records to ``duanzi.txt``, print the count."""
        # One greenlet per page. The page number replaces the Curpage value
        # instead of being appended to the URL (which produced "Curpage=11/").
        job_list = [
            gevent.spawn(self.send_request, self._build_page_url(page))
            for page in range(1, 14)
        ]
        # Block until every greenlet has finished.
        gevent.joinall(job_list)
        # The context manager closes the file even if a write fails.
        with open("duanzi.txt", "wb") as local_file:
            while not self.data_queue.empty():
                content = self.data_queue.get()
                result = str(content).encode("utf-8")
                local_file.write(result + b'\n')
        print(self.count)

if __name__ == "__main__":
    # Script entry point: build a Spider and run the complete crawl.
    spider = Spider()
    spider.start_work()

    # Performance profiling (disabled): time one complete crawl.
    # spider = Spider()
    # start = time.time()
    # spider.start_work()
    # print("[INFO]: Useing time %f secend" % (time.time() - start))

我拼接后的代码

import requests
from queue import Queue
from lxml import etree
import gevent
import time
from bs4 import BeautifulSoup


class Spider(object):
    """Crawl the paginated list pages and save the article titles.

    One gevent greenlet is spawned per page. Each greenlet downloads its page
    and parses it with BeautifulSoup; the title found in every
    ``<td class="td_list">`` cell is printed and queued. When all greenlets
    have finished, the queue is drained into ``duanzi.txt``.

    NOTE(review): this listing shows no gevent monkey patching. Unless it is
    applied elsewhere, ``requests.get`` and ``time.sleep`` block the whole
    process, so the pages are fetched one after another — confirm.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 '
                          'Safari/537.36 Edg/111.0.1661.62'}
        # URL of the first list page; other pages differ only in ``Curpage``.
        self.url = "http://www.glutnn.cn/list.aspx?s=1&ClassID=110101&Curpage=1"
        # Queue collecting the scraped titles until they are written out.
        self.data_queue = Queue()
        # Number of titles scraped so far.
        self.count = 0

    def _build_page_url(self, page):
        """Return ``url`` with its ``Curpage`` query parameter set to *page*."""
        from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

        parts = urlsplit(self.url)
        query = dict(parse_qsl(parts.query, keep_blank_values=True))
        query["Curpage"] = str(page)
        return urlunsplit(parts._replace(query=urlencode(query)))

    def send_request(self, url):
        """Download *url* and hand the raw response body to ``parse_page``."""
        print("[INFO]: 正在抓取" + url)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        html = requests.get(url, headers=self.headers, timeout=10).content
        # Pause one second after every request to go easy on the server.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Print and queue the title of every ``td.td_list`` cell in *html*.

        The page passed in by ``send_request`` is parsed directly. Previously
        this method ignored its argument and downloaded ``self.url`` again, so
        every call parsed page 1 and nothing ever reached ``data_queue``.

        Args:
            html: Page body as ``bytes`` (decoded as UTF-8) or ``str``.
        """
        if isinstance(html, bytes):
            text = html.decode('utf-8', errors='replace')
        else:
            text = html
        # Keep a snapshot of the page on disk; every page overwrites the
        # previous snapshot, and the handle is closed right after writing.
        with open('work.html', 'w', encoding='utf-8') as snapshot:
            snapshot.write(text)
        soup = BeautifulSoup(text, features='html.parser')
        # Every <td class="td_list"> cell is expected to hold one title link.
        for cell in soup.find_all('td', class_="td_list"):
            link = cell.a
            if link is None:
                # Cell without an <a> tag: nothing to record.
                continue
            title = link.text
            print(title)
            self.data_queue.put(title)
            self.count += 1

    def start_work(self):
        """Crawl pages 1-13, write all titles to ``duanzi.txt``, print the count."""
        # One greenlet per page. The page number replaces the Curpage value
        # instead of being appended to the URL (which produced "Curpage=11/").
        job_list = [
            gevent.spawn(self.send_request, self._build_page_url(page))
            for page in range(1, 14)
        ]
        # Block until every greenlet has finished.
        gevent.joinall(job_list)
        # The context manager closes the file even if a write fails.
        with open("duanzi.txt", "wb") as local_file:
            while not self.data_queue.empty():
                content = self.data_queue.get()
                result = str(content).encode("utf-8")
                local_file.write(result + b'\n')
        print(self.count)


if __name__ == "__main__":
    # Script entry point: build a Spider and run the complete crawl.
    spider = Spider()
    spider.start_work()

    # Performance profiling (disabled): time one complete crawl.
    # spider = Spider()
    # start = time.time()
    # spider.start_work()
    # print("[INFO]: Useing time %f secend" % (time.time() - start))