# Python crawler for the Baidu Tieba "巴塞罗那" (Barcelona) forum.
# coding=utf-8

"""
author: lei
function: crawl thread titles and authors from a Tieba forum, page by page,
and append the results to a local JSON file.
"""

import requests
from lxml import etree
import json

class TieBaBaSa(object):
    """Crawler for a Baidu Tieba forum.

    Walks the forum's thread-list pages, extracting each thread's title and
    author, and appends the results of every page to ``tieba.json``.
    """

    def __init__(self, name):
        # First thread-list page for the forum identified by `name`.
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(name)
        # Desktop UA so the server returns the full desktop markup.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"}

    def get_data(self, url):
        """Fetch `url` and return the decoded HTML as a string.

        Tieba wraps the real thread list inside HTML comments; stripping the
        '<!--' / '-->' markers makes that content visible to the XPath parser.
        """
        response = requests.get(url, headers=self.headers).content.decode()
        return response.replace("<!--", "").replace("-->", "")

    def parse_data(self, data):
        """Parse one thread-list page.

        Returns a tuple ``(data_list, next_url)`` where `data_list` is a list
        of ``{"title": ..., "writer": ...}`` dicts and `next_url` is the URL of
        the next page, or None on the last page.
        """
        html = etree.HTML(data)
        el_list = html.xpath('//ul[@id="thread_list"]//li//div[@class="col2_right j_threadlist_li_right"]')
        if not el_list:
            # Some page variants render the class attribute with a trailing space.
            el_list = html.xpath('//ul[@id="thread_list"]//li//div[@class="col2_right j_threadlist_li_right "]')

        data_list = []
        for el in el_list:
            titles = el.xpath("./div/div/a/text()")
            writers = el.xpath("./div/div[2]/span/@title")
            # Skip ads / malformed entries instead of crashing on an empty match.
            if not titles or not writers:
                continue
            data_list.append({"title": titles[0], "writer": writers[0]})

        # The last page has no "next page" link; return None so run() can stop
        # (the original indexed [0] unconditionally and crashed there).
        next_nodes = html.xpath("//a[contains(text(), '下一页>')]/@href")
        next_url = "https:" + next_nodes[0] if next_nodes else None

        return data_list, next_url

    def save_data(self, data_list):
        """Append `data_list` as one JSON document (one line) to tieba.json."""
        with open("tieba.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(data_list, ensure_ascii=False))
            # Newline separator keeps successive appends readable (JSON Lines).
            f.write("\n")
            print("保存成功!")

    def run(self):
        """Crawl from the first page until parse_data reports no next page."""
        next_url = self.url
        while next_url is not None:
            data = self.get_data(next_url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)

# Script entry point: crawl the "巴塞罗那" (Barcelona) forum.
# The original `if name == 'main'` raised NameError; the dunder guard is required.
if __name__ == '__main__':
    tieba = TieBaBaSa("巴塞罗那")
    tieba.run()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值