A Web Crawler Using XPath
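
Both spiders below follow the same pattern: fetch a page with requests, parse the response into an lxml Element with etree.HTML, and pull data out of it with XPath expressions. As a minimal sketch of that pattern (the HTML snippet and the XPath expressions here are made up for illustration, not taken from either site):

from lxml import etree

# made-up HTML, just to show how xpath() behaves on an Element
html = etree.HTML("<ul><li class='item'><a href='/post/1'>first post</a></li></ul>")

hrefs = html.xpath("//li[@class='item']/a/@href")    # attribute values -> list of strings
titles = html.xpath("//li[@class='item']/a/text()")  # text nodes -> list of strings
print(hrefs, titles)  # ['/post/1'] ['first post']

xpath() always returns a list, which is why both spiders below check the list length (or index [0]) before using a result.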

Crawling a Tieba forum

import requests
from lxml import etree
import json


class Tieba(object):
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"
        }
    def get_url_list(self):
        url = "https://tieba.baidu.com/f?kw=" + self.tieba_name + "&ie=utf-8&pn={}&"
        url_list = []
        for i in range(100):  # build 100 page URLs; each page advances the offset by 50
            url_list.append(url.format(i * 50))
        return url_list  # return the list of 100 URLs
    
    def parse_url(self, url):
        '''Send a request, get the response, and parse the HTML with etree'''
        print("parsing url:", url)
        response = requests.get(url, headers=self.headers)  # send the request
        html = response.content.decode()  # get the HTML as a string
        html = etree.HTML(html)  # parse into an lxml Element
        return html

    def get_title_href(self, url):
        '''Get the title and href of every post on one list page'''
        html = self.parse_url(url)
        li_temp_list = html.xpath("//li[@class='tl_shadow']")  # group results by li tag
        total_items = []
        for i in li_temp_list:  # iterate over the groups
            href = "https:" + i.xpath("./a/@href")[0] if len(i.xpath("./a/@href")) > 0 else None
            text = i.xpath("./a/div[1]/span[1]/text()")
            text = text[0] if len(text) > 0 else None
            item = dict(  # collect into a dict
                href=href,
                text=text
            )
            total_items.append(item)
        return total_items  # return all items on one page
    def get_img(self, url):
        '''Get all image URLs inside one post'''
        html = self.parse_url(url)  # returns an lxml Element, which has an xpath method
        img_list = html.xpath('//div[@data-class="BDE_Image"]/@data-url')
        img_list = [i.split("src=")[-1] for i in img_list]  # extract the image URL
        img_list = [requests.utils.unquote(i) for i in img_list]  # decode URL-encoded characters
        return img_list
 
    def save_item(self, item):
        '''Save one item to a local file'''
        with open("teibatupian.txt", "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False, indent=2))
            f.write("\n")
 
    def run(self):
        # 1. work out the URL pattern and build the URL list
        url_list = self.get_url_list()
        for url in url_list:
            # 2. for each URL, send the request, get the response, parse with etree
            # 3. extract title and href for every post
            total_item = self.get_title_href(url)
            for item in total_item:
                href = item["href"]
                img_list = self.get_img(href)  # get the list of images in the post
                item["img"] = img_list
                # 4. save to a local file
                print(item)
                self.save_item(item)

if __name__ == "__main__":
    tieba = Tieba("图书")
    tieba.run()
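
Note that run() only records the image URLs inside each saved item; it never downloads the files themselves. If the image bytes are wanted on disk as well, a small helper along the lines of the sketch below could be called with item["img"] before save_item. download_images and the tieba_imgs folder name are assumptions for illustration, not part of the original spider:

import os
import requests

def download_images(img_list, folder="tieba_imgs"):
    '''Hypothetical helper: download every image URL in img_list into a local folder.'''
    os.makedirs(folder, exist_ok=True)
    for index, img_url in enumerate(img_list):
        response = requests.get(img_url, timeout=10)
        if response.status_code == 200:
            # files are simply numbered by their position in the list
            with open(os.path.join(folder, "{}.jpg".format(index)), "wb") as f:
                f.write(response.content)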

Crawling jokes from Qiushibaike

import requests
from retrying import retry
from lxml import etree
 
class Qiubai_spider():
    def __init__(self):
        self.url = "http://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"
        }
 
    @retry(stop_max_attempt_number=5)  # retry the request up to 5 times when the assert fails
    def parse_url(self, url):
        response = requests.get(url, timeout=10, headers=self.headers)  # request the URL
        assert response.status_code == 200  # raise if the status code is not 200
        print(url)
        return etree.HTML(response.text)  # return the parsed etree HTML
 
    def parse_content(self, html):
        item_temp = html.xpath("//div[@class='article block untagged mb15']")
        print(len(item_temp))
        for item in item_temp:
            # get the user's avatar URL
            avatar = item.xpath("./div[1]/a[1]/img/@src")[0] if len(item.xpath("./div[1]/a[1]/img/@src")) > 0 else None
            # add the scheme prefix to the avatar URL
            if avatar is not None and not avatar.startswith("http:"):
                avatar = "http:" + avatar
            print(avatar)
            name = item.xpath("./div[1]/a[2]/h2/text()")[0]  # get the username
            print(name)
            content = item.xpath("./a[@class='contentHerf']/div/span/text()")[0]  # get the joke text
            print(content)
            star_number = item.xpath("./div[@class='stats']/span[1]/i/text()")[0]  # get the number of likes
            print(star_number)
            comment_number = item.xpath("./div[@class='stats']/span[2]/a/i/text()")[0]  # get the number of comments
            print(comment_number)
            print("*" * 100)
 
    def run(self):
        '''Main logic of the spider'''
        url = self.url.format(1)  # build the URL for page 1
        html = self.parse_url(url)  # request the URL
        self.parse_content(html)  # parse the page content
 
if __name__ == "__main__":
    qiubai = Qiubai_spider()
    qiubai.run()
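
As written, run() only crawls page 1. A variant of the main block, sketched below, walks several list pages by filling the page number into self.url; the upper bound of 13 pages is an arbitrary choice for illustration, and the real number of list pages may differ:

if __name__ == "__main__":
    qiubai = Qiubai_spider()
    for page in range(1, 14):          # pages 1..13, an assumed range
        url = qiubai.url.format(page)  # fill the page number into the URL template
        html = qiubai.parse_url(url)   # request and parse the page
        qiubai.parse_content(html)     # extract and print the items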