import requests
from lxml import etree
import pymongo
class QiushiSpider:
    """Crawl one page of text jokes from qiushibaike.com and store them in MongoDB.

    Fetches ``/text/page/8/``, extracts each joke's author, content, laugh count
    and comment count via XPath, and inserts one document per joke into the
    ``Baike.baikeset1`` collection.
    """

    def __init__(self):
        # URL of the page to crawl
        self.url = "https://www.qiushibaike.com/text/page/8/"
        # Minimal User-Agent so the site does not reject the request.
        # BUG FIX: original value was "Mozilla5.0/", a malformed UA string.
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # Store results in MongoDB: database "Baike", collection "baikeset1".
        # MongoClient connects lazily; no I/O happens here.
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Baike
        self.myset = self.db.baikeset1

    def getPage(self):
        """Download the target page and hand its HTML to parsePage()."""
        res = requests.get(self.url, headers=self.headers)
        # BUG FIX: original had `res.enconding = "utf-8"` — the typo silently
        # created a new attribute and left the real response encoding untouched.
        res.encoding = "utf-8"
        self.parsePage(res.text)

    def parsePage(self, html):
        """Parse the page HTML and insert one document per joke into Mongo.

        :param html: full HTML text of the listing page
        """
        parsed = etree.HTML(html)
        # One <div id="qiushi_tag_..."> wrapper per joke — the base XPath.
        base_list = parsed.xpath('//div[contains(@id,"qiushi_tag_")]')
        for node in base_list:
            # Guarded extraction: a single malformed joke node used to raise
            # IndexError/AttributeError and abort the whole page; now it is
            # skipped and the remaining jokes are still stored.
            try:
                # user nickname
                username = node.xpath('./div/a/h2')[0].text.strip()
                # joke content
                content = node.xpath('./a/div[@class="content"]/span')[0].text.strip()
                # laugh count
                laughf_num = node.xpath(".//span/i")[0].text.strip()
                # comment count
                ping_num = node.xpath('.//i[@class="number"]')[1].text.strip()
            except (IndexError, AttributeError):
                continue
            d = {
                "username": username,
                "content": content,
                "laughf_num": laughf_num,
                "ping_num": ping_num,
            }
            # BUG FIX: Collection.insert() was deprecated in pymongo 3 and
            # removed in pymongo 4; insert_one() is the supported API.
            self.myset.insert_one(d)
if __name__ == "__main__":
    # Script entry point: crawl the configured page and persist its jokes.
    spider = QiushiSpider()
    spider.getPage()
# Scraped blog residue, kept as comments so the module stays importable:
# "Crawling Qiushibaike with Python" (python爬取糗事百科)
# Latest recommended article published 2024-10-10 19:29:49