python爬虫,百度贴吧

# coding=utf-8

"""
author: lei
function: Baidu Tieba spider — crawl a forum's thread titles and links page by page.
"""

import requests
from lxml import etree

class TieBa(object):
    """Baidu Tieba spider: walks a forum's thread-list pages, extracting
    each thread's title and link, and follows the next-page link until
    the last page is reached.
    """

    def __init__(self, name):
        """Build the first-page URL for forum *name* and browser-like headers.

        :param name: forum (贴吧) name, interpolated into the kw= query param.
        """
        self.url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0".format(name)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
            # NOTE(review): the legacy IE UA below appears to have been used to
            # get a simpler page variant — kept for reference, but the XPaths
            # in parse_data target the desktop DOM served to the Chrome UA.
            # "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0; DigExt)"
        }

    def get_data(self, url):
        """GET *url* with the spoofed browser headers.

        :param url: absolute page URL to fetch.
        :return: raw response body as bytes.
        """
        response = requests.get(url, headers=self.headers)
        return response.content

    def parse_data(self, data):
        """Extract thread records and the next-page URL from one list page.

        Tieba wraps the real thread list in HTML comments for non-browser
        clients, so the comment markers are stripped before parsing.

        :param data: raw page bytes from :meth:`get_data`.
        :return: (data_list, next_url) — list of {'title', 'link'} dicts,
                 and the absolute next-page URL or None on the last page.
        """
        data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath(
            '''//ul[@id='thread_list']//li//div[@class="threadlist_title pull_left j_th_tit "]/a'''
        )
        print(len(el_list))

        data_list = []
        for el in el_list:
            temp = {}
            # xpath() returns a list even for a single attribute hit.
            temp['title'] = el.xpath("./@title")[0]
            temp['link'] = "https://tieba.baidu.com/" + el.xpath("./@href")[0]
            data_list.append(temp)

        # Next-page anchor; an empty XPath result (IndexError on [0]) means
        # we are on the last page.  Catch only IndexError so real bugs
        # (e.g. AttributeError from a parse failure) are not swallowed.
        try:
            next_url = "https:" + html.xpath("//a[contains(text(), '下一页>')]/@href")[0]
        except IndexError:
            next_url = None

        return data_list, next_url

    def save_data(self, data_list):
        """Persist the extracted records — currently just prints each one.

        :param data_list: list of {'title', 'link'} dicts.
        """
        for data in data_list:
            print(data)

    def run(self):
        """Crawl from the first page, following next-page links until None."""
        next_url = self.url
        while True:
            # fetch -> parse -> save, then advance to the next page
            data = self.get_data(next_url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            if next_url is None:
                break

# Entry point: crawl the "传智播客" forum.  The garbled `name == 'main'`
# in the scraped original is the standard script guard.
if __name__ == '__main__':
    tieba = TieBa("传智播客")
    tieba.run()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值