Python爬虫学习：案例-主题列表内容抓取（6）

最新推荐文章于 2023-01-08 20:14:14 发布

南淮北安

最新推荐文章于 2023-01-08 20:14:14 发布

阅读量494

点赞数 1

分类专栏：Python 爬虫学习　文章标签：爬虫案例、python

本文链接：https://blog.csdn.net/nanhuaibeian/article/details/86644582

版权

Python 爬虫学习专栏收录该内容

58 篇文章 29 订阅

订阅专栏

审查元素
分析
功能代码

      #获取标题和URL的方法
    def extract_tag_a(self,columns,index):
        """Return the text and href of the first <a> inside one table cell.

        Args:
            columns: Sequence of cell elements from one table row; each
                must support ``xpath``.
            index: Position in ``columns`` of the cell holding the link.

        Returns:
            A ``(title, url)`` tuple: the link's text and its ``href``
            attribute.

        Raises:
            IndexError: If the cell is missing or contains no <a> child.
            KeyError: If the link has no ``href`` attribute.
        """
        # Evaluate the XPath once and read both values from the same node.
        link = columns[index].xpath('a')[0]
        return link.text, link.attrib['href']

    # Read the rating, like count or reply count from a cell.
    def extract_text(self,columns,index):
        """Return the text of one table cell, or 0 when the cell is empty.

        Args:
            columns: Sequence of cell elements from one table row.
            index: Position in ``columns`` of the cell to read.

        Returns:
            The cell's ``text`` unchanged, or the integer 0 when ``text``
            is None.
        """
        tt = columns[index].text
        # An empty cell has text None; report it as 0 instead.
        if tt is None:
            tt = 0
        return tt
	#获取主题列表内容
    def get_post_list(self):
        """Build one dict per row of the board's topic table.

        Reads ``self.tree`` and delegates cell parsing to
        ``self.extract_tag_a`` and ``self.extract_text``.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``author_id``, ``author_url``, ``rating``, ``num_likes`` and
            ``num_replies``, in table order.
        """
        collected = []
        for tr in self.tree.xpath("//table[@class='board-list tiz']/tbody/tr"):
            cells = tr.xpath('td')
            # Cell 1 holds the topic link, cell 3 the author link.
            topic_title, topic_url = self.extract_tag_a(cells, 1)
            author_name, author_link = self.extract_tag_a(cells, 3)
            collected.append({
                'title': topic_title,
                'url': topic_url,
                'author_id': author_name,
                'author_url': author_link,
                # Cells 4, 5 and 6: rating, like count, reply count.
                'rating': self.extract_text(cells, 4),
                'num_likes': self.extract_text(cells, 5),
                'num_replies': self.extract_text(cells, 6),
            })
        return collected

主要代码

import re
import requests
from lxml import etree

class PostListCrawler:
    """Fetch one page of a newsmth.net board and extract its topic list.

    Call ``get_content`` first: it stores the response text in
    ``self.html`` and the parsed tree in ``self.tree``, which the other
    methods read.
    """

    # Scheme and host that every board path is appended to.
    domain = "https://www.newsmth.net"

    def get_content(self, board_url, page, timeout=10):
        """Download one page of a board and cache it on the instance.

        Args:
            board_url: Board path appended to ``domain``.
            page: Page number, sent as the ``p`` query parameter.
            timeout: Seconds handed to ``requests.get`` so that a stalled
                connection cannot block the caller forever.

        Returns:
            The response body text (also stored in ``self.html``).
        """
        querystring = {"ajax": "", "p": str(page)}
        url = self.domain + board_url
        r = requests.get(url, params=querystring, timeout=timeout)
        # Keep both the raw text and the parsed tree for the other methods.
        self.html = r.text
        self.tree = etree.HTML(r.text)
        return self.html

    def get_max_page(self):
        """Return the highest page number listed in the pager, as an int.

        Returns:
            1 when the pager has at most one entry (or is absent),
            otherwise the number read from the pager's last numeric link.

        Raises:
            ValueError: If the link text that should hold the page number
                is not numeric.
        """
        # Reuse the tree parsed by get_content; parse self.html only when
        # no tree has been stored on the instance.
        tree = getattr(self, "tree", None)
        if tree is None:
            tree = etree.HTML(self.html)
        pages = tree.xpath('//ol[@class="page-main"][1]/li')
        # Zero or one pager entry: treat the board as a single page.
        if len(pages) <= 1:
            return 1
        last_text = pages[-1].xpath('a')[0].text
        # NOTE(review): '>>' is presumably the "next page" link, in which
        # case the largest page number is the entry just before it.
        # Confirm against the live pager markup.
        if last_text == '>>':
            return int(pages[-2].xpath('a')[0].text)
        # Convert here too so that both paths return the same type.
        return int(last_text)

    def extract_tag_a(self, columns, index):
        """Return the text and href of the first <a> inside one table cell.

        Args:
            columns: Sequence of cell elements from one table row.
            index: Position in ``columns`` of the cell holding the link.

        Returns:
            A ``(title, url)`` tuple: the link's text and its ``href``
            attribute.

        Raises:
            IndexError: If the cell is missing or contains no <a> child.
            KeyError: If the link has no ``href`` attribute.
        """
        # Evaluate the XPath once and read both values from the same node.
        link = columns[index].xpath('a')[0]
        return link.text, link.attrib['href']

    def extract_text(self, columns, index):
        """Return the text of one table cell, or 0 when the cell is empty.

        Used for the rating, like-count and reply-count cells.

        Args:
            columns: Sequence of cell elements from one table row.
            index: Position in ``columns`` of the cell to read.

        Returns:
            The cell's ``text`` unchanged, or the integer 0 when ``text``
            is None.
        """
        tt = columns[index].text
        # An empty cell has text None; report it as 0 instead.
        if tt is None:
            tt = 0
        return tt

    def get_post_list(self):
        """Build one dict per row of the board's topic table.

        Reads ``self.tree``, so ``get_content`` must have been called.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``author_id``, ``author_url``, ``rating``, ``num_likes`` and
            ``num_replies``, in table order.
        """
        rows = self.tree.xpath("//table[@class='board-list tiz']/tbody/tr")
        posts = []
        for row in rows:
            post = {}
            columns = row.xpath('td')
            # Cell 1: topic title and topic URL.
            post['title'], post['url'] = self.extract_tag_a(columns, 1)
            # Cell 3: author name and author URL.
            post['author_id'], post['author_url'] = self.extract_tag_a(columns, 3)
            # Cells 4, 5 and 6: rating, like count, reply count.
            post['rating'] = self.extract_text(columns, 4)
            post['num_likes'] = self.extract_text(columns, 5)
            post['num_replies'] = self.extract_text(columns, 6)
            posts.append(post)
        return posts

if __name__ == "__main__":
    # Demo run: fetch page 1 of the AutoWorld board and print its topics.
    plc = PostListCrawler()
    # get_content stores the page on the instance; its return value is
    # kept in `content` but not used below.
    content = plc.get_content(board_url='/nForum/board/AutoWorld', page=1)
    post_list = plc.get_post_list()
    print(post_list)

南淮北安

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
Python爬虫学习：案例-主题列表内容抓取（6）

审查元素分析功能代码 #获取标题和URL的方法 def extract_tag_a(self,columns,index): title = columns[index].xpath('a')[0].text url = columns[index].xpath('a')[0].attrib['href'] retu...
复制链接

扫一扫