import requests
from lxml import etree
import pymongo
class QiushiSpider:
    """Crawl one page of text jokes from qiushibaike.com and store them in MongoDB.

    Fetches ``/text/page/8/``, extracts each joke's author, content, laugh count
    and comment count via XPath, and inserts one document per joke into the
    ``Baike.baikeset1`` collection.
    """

    def __init__(self):
        # URL of the page to crawl
        self.url = "https://www.qiushibaike.com/text/page/8/"
        # Minimal User-Agent so the site does not reject the request.
        # BUG FIX: original value was "Mozilla5.0/", a malformed UA string.
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # Store results in MongoDB: database "Baike", collection "baikeset1".
        # MongoClient connects lazily; no I/O happens here.
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Baike
        self.myset = self.db.baikeset1

    def getPage(self):
        """Download the target page and hand its HTML to parsePage()."""
        res = requests.get(self.url, headers=self.headers)
        # BUG FIX: original had `res.enconding = "utf-8"` — the typo silently
        # created a new attribute and left the real response encoding untouched.
        res.encoding = "utf-8"
        self.parsePage(res.text)

    def parsePage(self, html):
        """Parse the page HTML and insert one document per joke into Mongo.

        :param html: full HTML text of the listing page
        """
        parsed = etree.HTML(html)
        # One <div id="qiushi_tag_..."> wrapper per joke — the base XPath.
        base_list = parsed.xpath('//div[contains(@id,"qiushi_tag_")]')
        for node in base_list:
            # Guarded extraction: a single malformed joke node used to raise
            # IndexError/AttributeError and abort the whole page; now it is
            # skipped and the remaining jokes are still stored.
            try:
                # user nickname
                username = node.xpath('./div/a/h2')[0].text.strip()
                # joke content
                content = node.xpath('./a/div[@class="content"]/span')[0].text.strip()
                # laugh count
                laughf_num = node.xpath(".//span/i")[0].text.strip()
                # comment count
                ping_num = node.xpath('.//i[@class="number"]')[1].text.strip()
            except (IndexError, AttributeError):
                continue
            d = {
                "username": username,
                "content": content,
                "laughf_num": laughf_num,
                "ping_num": ping_num,
            }
            # BUG FIX: Collection.insert() was deprecated in pymongo 3 and
            # removed in pymongo 4; insert_one() is the supported API.
            self.myset.insert_one(d)
if __name__ == "__main__":
    # Script entry point: crawl the configured page and persist its jokes.
    spider = QiushiSpider()
    spider.getPage()
# Scraped blog residue, kept as comments so the module stays importable:
# "Crawling Qiushibaike with Python" (python爬取糗事百科)
# Latest recommended article published 2024-10-10 19:29:49